In [1]:
import pandas as pd
import seaborn as sns
from datetime import timedelta
from datetime import datetime



In [2]:
disease_df = pd.read_csv('disease_data.csv', index_col='Date')
weather_df = pd.read_csv('weather_data.csv', index_col='datetime')

In [3]:
weather_df.columns

Index(['name', 'tempmax', 'tempmin', 'temp', 'feelslikemax', 'feelslikemin',
       'feelslike', 'dew', 'humidity', 'precip', 'precipprob', 'precipcover',
       'preciptype', 'snow', 'snowdepth', 'windgust', 'windspeed', 'winddir',
       'sealevelpressure', 'cloudcover', 'visibility', 'solarradiation',
       'solarenergy', 'uvindex', 'severerisk', 'sunrise', 'sunset',
       'moonphase', 'conditions', 'description', 'icon', 'stations'],
      dtype='object')

In [4]:
selected_cols = ['temp', 'dew', 'humidity', 'precip']

In [5]:
reduced_weather_df = weather_df[selected_cols]

In [6]:
column_names = list(disease_df.columns)
for i in range(7):
    for col in reduced_weather_df.columns:
        column_names.append(f"day_{i}_{col}")
column_names.append('days_after_pruning')

In [7]:
data = []

In [8]:
def generate_week(date_str):
    date = datetime.strptime(date_str, "%Y-%m-%d")
    week = []
    for i in range(7):
        day = date - timedelta(days=i)
        week.append(day.strftime("%Y-%m-%d"))
    week.reverse()
    return week


In [9]:
for date in disease_df.index:
    try:
        drow = list(disease_df.loc[date])
        week = generate_week(date)
        for i in range(len(week)):
            row = reduced_weather_df.loc[week[i]]
            drow.extend(row)
        cur_date = datetime.strptime(date, '%Y-%m-%d')
        pruning_date = datetime.strptime(f'{cur_date.year}-09-25', '%Y-%m-%d')
        if pruning_date > cur_date:
            pruning_date = datetime.strptime(f'{cur_date.year - 1}-09-25', '%Y-%m-%d')
        diff = cur_date - pruning_date
        drow.append(diff.days)
        data.append(drow)
    except:
        pass

In [10]:
data_df = pd.DataFrame(data, columns=column_names)

In [11]:
data_df = data_df.drop(['Rust', 'Bacterial Spot', 'Bunch Rot'], axis=1)

In [12]:
data_df = data_df.dropna(subset=['Downy Mildew'])

In [13]:
data_df

Unnamed: 0,Downy Mildew,Powdery Mildew,Anthracnose,day_0_temp,day_0_dew,day_0_humidity,day_0_precip,day_1_temp,day_1_dew,day_1_humidity,...,day_4_precip,day_5_temp,day_5_dew,day_5_humidity,day_5_precip,day_6_temp,day_6_dew,day_6_humidity,day_6_precip,days_after_pruning
0,moderate,low,moderate,23.9,8.7,40.1,0.0,22.9,5.8,34.5,...,0.0,23.6,9.5,42.7,0.0,23.3,10.4,46.3,0.0,94
1,moderate,low,moderate,21.9,10.8,50.2,0.0,22.3,13.6,58.4,...,0.0,22.4,11.1,49.8,0.0,23.2,11.9,50.0,0.0,87
2,moderate,low,moderate,23.4,16.4,66.2,0.0,24.4,16.2,61.8,...,0.0,23.5,12.2,51.0,0.0,22.3,11.7,52.2,0.0,80
3,moderate,low,moderate,25.4,18.3,66.8,1.1,25.6,16.8,59.3,...,0.0,23.8,17.8,70.3,0.1,22.9,16.9,70.7,0.0,73
4,moderate,low,moderate,25.7,18.8,66.7,0.1,25.6,19.3,69.3,...,0.2,24.5,18.8,72.7,1.9,25.9,18.9,66.5,5.0,66
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
134,nil,nil,low,28.0,21.9,70.6,0.0,27.6,22.1,73.2,...,4.4,28.1,22.4,72.8,4.0,26.5,22.5,80.0,43.0,251
135,nil,nil,nil,27.9,21.7,70.8,0.0,27.7,21.8,71.5,...,1.0,28.0,21.4,68.6,0.0,27.7,21.1,68.6,0.0,244
136,nil,nil,nil,30.0,21.3,61.8,0.5,28.8,21.8,67.1,...,3.4,26.7,22.5,78.1,0.6,28.2,21.1,67.5,0.0,237
137,nil,nil,nil,28.2,22.0,71.4,13.0,28.6,22.1,69.8,...,0.1,29.3,20.8,62.0,0.0,28.8,21.5,66.4,1.4,230


In [14]:
data_df.to_csv('processed_data.csv', index=False)

### Dataset loading

In [15]:
import pandas as pd
import seaborn as sns

In [16]:
data_df = pd.read_csv('processed_data.csv')

### Train Test Split

In [17]:
X = data_df.drop(['Downy Mildew', 'Anthracnose', 'Powdery Mildew'], axis=1)
y1 = data_df['Downy Mildew']
y2 = data_df['Powdery Mildew']
y3 = data_df['Anthracnose']

In [18]:
from sklearn.model_selection import train_test_split

In [19]:
X_train, X_test, y1_train, y1_test, y2_train, y2_test, y3_train, y3_test = train_test_split(X, y1, y2, y3, random_state=40)

### Model Training

In [20]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score

#### Random Forest Classifier

In [21]:
downy_clf = RandomForestClassifier(random_state=40)
powdery_clf = RandomForestClassifier(random_state=40)
anthra_clf = RandomForestClassifier(random_state=40)

In [22]:
downy_clf.fit(X_train, y1_train)
powdery_clf.fit(X_train, y2_train)
anthra_clf.fit(X_train, y3_train)

In [23]:
y1_pred = downy_clf.predict(X_test)
y2_pred = powdery_clf.predict(X_test)
y3_pred = anthra_clf.predict(X_test)

In [24]:
print(f"""
Downy Mildew:
    Accuracy: {accuracy_score(y1_test, y1_pred)}
    F1 Score: {f1_score(y1_test, y1_pred, average='micro')}
    Precision: {precision_score(y1_test, y1_pred, average='micro')}
    Recall: {recall_score(y1_test, y1_pred, average='micro')}
""")
print(f"""
Powdery Mildew:
    Accuracy: {accuracy_score(y2_test, y2_pred)}
    F1 Score: {f1_score(y2_test, y2_pred, average='micro')}
    Precision: {precision_score(y2_test, y2_pred, average='micro')}
    Recall: {recall_score(y2_test, y2_pred, average='micro')}
""")
print(f"""
Anthracnose:
    Accuracy: {accuracy_score(y3_test, y3_pred)}
    F1 Score: {f1_score(y3_test, y3_pred, average='micro')}
    Precision: {precision_score(y3_test, y3_pred, average='micro')}
    Recall: {recall_score(y3_test, y3_pred, average='micro')}
""")


Downy Mildew:
    Accuracy: 0.6451612903225806
    F1 Score: 0.6451612903225806
    Precision: 0.6451612903225806
    Recall: 0.6451612903225806


Powdery Mildew:
    Accuracy: 0.4838709677419355
    F1 Score: 0.4838709677419355
    Precision: 0.4838709677419355
    Recall: 0.4838709677419355


Anthracnose:
    Accuracy: 0.7419354838709677
    F1 Score: 0.7419354838709677
    Precision: 0.7419354838709677
    Recall: 0.7419354838709677



#### Decision Tree Classifier

In [25]:
downy_clf = DecisionTreeClassifier(random_state=40)
powdery_clf = DecisionTreeClassifier(random_state=40)
anthra_clf = DecisionTreeClassifier(random_state=40)

In [26]:
downy_clf.fit(X_train, y1_train)
powdery_clf.fit(X_train, y2_train)
anthra_clf.fit(X_train, y3_train)

In [27]:
y1_pred = downy_clf.predict(X_test)
y2_pred = powdery_clf.predict(X_test)
y3_pred = anthra_clf.predict(X_test)

In [28]:
print(f"""
Downy Mildew:
    Accuracy: {accuracy_score(y1_test, y1_pred)}
    F1 Score: {f1_score(y1_test, y1_pred, average='micro')}
    Precision: {precision_score(y1_test, y1_pred, average='micro')}
    Recall: {recall_score(y1_test, y1_pred, average='micro')}
""")
print(f"""
Powdery Mildew:
    Accuracy: {accuracy_score(y2_test, y2_pred)}
    F1 Score: {f1_score(y2_test, y2_pred, average='micro')}
    Precision: {precision_score(y2_test, y2_pred, average='micro')}
    Recall: {recall_score(y2_test, y2_pred, average='micro')}
""")
print(f"""
Anthracnose:
    Accuracy: {accuracy_score(y3_test, y3_pred)}
    F1 Score: {f1_score(y3_test, y3_pred, average='micro')}
    Precision: {precision_score(y3_test, y3_pred, average='micro')}
    Recall: {recall_score(y3_test, y3_pred, average='micro')}
""")


Downy Mildew:
    Accuracy: 0.6451612903225806
    F1 Score: 0.6451612903225806
    Precision: 0.6451612903225806
    Recall: 0.6451612903225806


Powdery Mildew:
    Accuracy: 0.5806451612903226
    F1 Score: 0.5806451612903226
    Precision: 0.5806451612903226
    Recall: 0.5806451612903226


Anthracnose:
    Accuracy: 0.6129032258064516
    F1 Score: 0.6129032258064516
    Precision: 0.6129032258064516
    Recall: 0.6129032258064516



#### Support Vector Classifier

In [29]:
downy_clf = SVC(random_state=40)
powdery_clf = SVC(random_state=40)
anthra_clf = SVC(random_state=40)

In [30]:
downy_clf.fit(X_train, y1_train)
powdery_clf.fit(X_train, y2_train)
anthra_clf.fit(X_train, y3_train)

In [31]:
y1_pred = downy_clf.predict(X_test)
y2_pred = powdery_clf.predict(X_test)
y3_pred = anthra_clf.predict(X_test)

In [32]:
print(f"""
Downy Mildew:
    Accuracy: {accuracy_score(y1_test, y1_pred)}
    F1 Score: {f1_score(y1_test, y1_pred, average='micro')}
    Precision: {precision_score(y1_test, y1_pred, average='micro')}
    Recall: {recall_score(y1_test, y1_pred, average='micro')}
""")
print(f"""
Powdery Mildew:
    Accuracy: {accuracy_score(y2_test, y2_pred)}
    F1 Score: {f1_score(y2_test, y2_pred, average='micro')}
    Precision: {precision_score(y2_test, y2_pred, average='micro')}
    Recall: {recall_score(y2_test, y2_pred, average='micro')}
""")
print(f"""
Anthracnose:
    Accuracy: {accuracy_score(y3_test, y3_pred)}
    F1 Score: {f1_score(y3_test, y3_pred, average='micro')}
    Precision: {precision_score(y3_test, y3_pred, average='micro')}
    Recall: {recall_score(y3_test, y3_pred, average='micro')}
""")


Downy Mildew:
    Accuracy: 0.7741935483870968
    F1 Score: 0.7741935483870968
    Precision: 0.7741935483870968
    Recall: 0.7741935483870968


Powdery Mildew:
    Accuracy: 0.3870967741935484
    F1 Score: 0.3870967741935484
    Precision: 0.3870967741935484
    Recall: 0.3870967741935484


Anthracnose:
    Accuracy: 0.7419354838709677
    F1 Score: 0.7419354838709677
    Precision: 0.7419354838709677
    Recall: 0.7419354838709677



#### Logistic Regression

In [33]:
downy_clf = LogisticRegression(random_state=40)
powdery_clf = LogisticRegression(random_state=40)
anthra_clf = LogisticRegression(random_state=40)

In [34]:
downy_clf.fit(X_train, y1_train)
powdery_clf.fit(X_train, y2_train)
anthra_clf.fit(X_train, y3_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

In [35]:
y1_pred = downy_clf.predict(X_test)
y2_pred = powdery_clf.predict(X_test)
y3_pred = anthra_clf.predict(X_test)

In [36]:
print(f"""
Downy Mildew:
    Accuracy: {accuracy_score(y1_test, y1_pred)}
    F1 Score: {f1_score(y1_test, y1_pred, average='micro')}
    Precision: {precision_score(y1_test, y1_pred, average='micro')}
    Recall: {recall_score(y1_test, y1_pred, average='micro')}
""")
print(f"""
Powdery Mildew:
    Accuracy: {accuracy_score(y2_test, y2_pred)}
    F1 Score: {f1_score(y2_test, y2_pred, average='micro')}
    Precision: {precision_score(y2_test, y2_pred, average='micro')}
    Recall: {recall_score(y2_test, y2_pred, average='micro')}
""")
print(f"""
Anthracnose:
    Accuracy: {accuracy_score(y3_test, y3_pred)}
    F1 Score: {f1_score(y3_test, y3_pred, average='micro')}
    Precision: {precision_score(y3_test, y3_pred, average='micro')}
    Recall: {recall_score(y3_test, y3_pred, average='micro')}
""")


Downy Mildew:
    Accuracy: 0.5161290322580645
    F1 Score: 0.5161290322580645
    Precision: 0.5161290322580645
    Recall: 0.5161290322580645


Powdery Mildew:
    Accuracy: 0.4838709677419355
    F1 Score: 0.4838709677419355
    Precision: 0.4838709677419355
    Recall: 0.4838709677419355


Anthracnose:
    Accuracy: 0.5483870967741935
    F1 Score: 0.5483870967741935
    Precision: 0.5483870967741935
    Recall: 0.5483870967741935



#### Generic Classification Evaluator

In [37]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import GradientBoostingClassifier

In [38]:
from sklearn.preprocessing import LabelEncoder
encoder = LabelEncoder()
encoder.fit(y1)

In [39]:
encoder.transform(y1_test)

array([3, 1, 3, 1, 2, 2, 3, 3, 2, 2, 3, 1, 1, 1, 3, 3, 3, 3, 3, 1, 3, 2,
       3, 1, 2, 3, 3, 2, 2, 1, 1])

In [40]:
def get_report(classifier_algorithm, use_random_state=True):

    # Instantiate the models
    if use_random_state:
        downy_clf = classifier_algorithm(random_state=40)
        powdery_clf = classifier_algorithm(random_state=40)
        anthra_clf = classifier_algorithm(random_state=40)
    else:
        downy_clf = classifier_algorithm()
        powdery_clf = classifier_algorithm()
        anthra_clf = classifier_algorithm()

    # Train the models
    downy_clf.fit(X_train, y1_train)
    powdery_clf.fit(X_train, y2_train)
    anthra_clf.fit(X_train, y3_train)

    # Make Predictions
    y1_pred = downy_clf.predict(X_test)
    y2_pred = powdery_clf.predict(X_test)
    y3_pred = anthra_clf.predict(X_test)

    # Evaluate predictions
    print(f"""
    Downy Mildew:
        Accuracy: {accuracy_score(y1_test, y1_pred):.4f}
        F1 Score: {f1_score(y1_test, y1_pred, average='weighted'):.4f}
        Precision: {precision_score(y1_test, y1_pred, average='weighted'):.4f}
        Recall: {recall_score(y1_test, y1_pred, average='weighted'):.4f}
    """)
    print(f"""
    Powdery Mildew:
        Accuracy: {accuracy_score(y2_test, y2_pred):.4f}
        F1 Score: {f1_score(y2_test, y2_pred, average='weighted'):.4f}
        Precision: {precision_score(y2_test, y2_pred, average='weighted'):.4f}
        Recall: {recall_score(y2_test, y2_pred, average='weighted'):.4f}
    """)
    print(f"""
    Anthracnose:
        Accuracy: {accuracy_score(y3_test, y3_pred):.4f}
        F1 Score: {f1_score(y3_test, y3_pred, average='weighted'):.4f}
        Precision: {precision_score(y3_test, y3_pred, average='weighted'):.4f}
        Recall: {recall_score(y3_test, y3_pred, average='weighted'):.4f}
    """)
    return downy_clf, powdery_clf, anthra_clf

In [41]:
get_report(LogisticRegression)


    Downy Mildew:
        Accuracy: 0.5161
        F1 Score: 0.5171
        Precision: 0.5249
        Recall: 0.5161
    

    Powdery Mildew:
        Accuracy: 0.4839
        F1 Score: 0.4966
        Precision: 0.5172
        Recall: 0.4839
    

    Anthracnose:
        Accuracy: 0.5484
        F1 Score: 0.5745
        Precision: 0.6137
        Recall: 0.5484
    


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

(LogisticRegression(random_state=40),
 LogisticRegression(random_state=40),
 LogisticRegression(random_state=40))

In [42]:
get_report(DecisionTreeClassifier)


    Downy Mildew:
        Accuracy: 0.6452
        F1 Score: 0.6548
        Precision: 0.6836
        Recall: 0.6452
    

    Powdery Mildew:
        Accuracy: 0.5806
        F1 Score: 0.5716
        Precision: 0.5797
        Recall: 0.5806
    

    Anthracnose:
        Accuracy: 0.6129
        F1 Score: 0.6073
        Precision: 0.6434
        Recall: 0.6129
    


(DecisionTreeClassifier(random_state=40),
 DecisionTreeClassifier(random_state=40),
 DecisionTreeClassifier(random_state=40))

In [43]:
get_report(RandomForestClassifier)


    Downy Mildew:
        Accuracy: 0.6452
        F1 Score: 0.6332
        Precision: 0.6387
        Recall: 0.6452
    

    Powdery Mildew:
        Accuracy: 0.4839
        F1 Score: 0.4835
        Precision: 0.4931
        Recall: 0.4839
    

    Anthracnose:
        Accuracy: 0.7419
        F1 Score: 0.7150
        Precision: 0.7928
        Recall: 0.7419
    


(RandomForestClassifier(random_state=40),
 RandomForestClassifier(random_state=40),
 RandomForestClassifier(random_state=40))

In [44]:
get_report(SVC)


    Downy Mildew:
        Accuracy: 0.7742
        F1 Score: 0.7742
        Precision: 0.7742
        Recall: 0.7742
    

    Powdery Mildew:
        Accuracy: 0.3871
        F1 Score: 0.3868
        Precision: 0.4264
        Recall: 0.3871
    

    Anthracnose:
        Accuracy: 0.7419
        F1 Score: 0.6694
        Precision: 0.6197
        Recall: 0.7419
    


  _warn_prf(average, modifier, msg_start, len(result))


(SVC(random_state=40), SVC(random_state=40), SVC(random_state=40))

In [45]:
get_report(KNeighborsClassifier, use_random_state=False)


    Downy Mildew:
        Accuracy: 0.7097
        F1 Score: 0.6972
        Precision: 0.7548
        Recall: 0.7097
    

    Powdery Mildew:
        Accuracy: 0.5806
        F1 Score: 0.5746
        Precision: 0.5888
        Recall: 0.5806
    

    Anthracnose:
        Accuracy: 0.6774
        F1 Score: 0.6955
        Precision: 0.7231
        Recall: 0.6774
    


  _warn_prf(average, modifier, msg_start, len(result))


(KNeighborsClassifier(), KNeighborsClassifier(), KNeighborsClassifier())

In [46]:
get_report(GradientBoostingClassifier)


    Downy Mildew:
        Accuracy: 0.6129
        F1 Score: 0.6046
        Precision: 0.6097
        Recall: 0.6129
    

    Powdery Mildew:
        Accuracy: 0.4839
        F1 Score: 0.4790
        Precision: 0.4753
        Recall: 0.4839
    

    Anthracnose:
        Accuracy: 0.7419
        F1 Score: 0.7372
        Precision: 0.7350
        Recall: 0.7419
    


(GradientBoostingClassifier(random_state=40),
 GradientBoostingClassifier(random_state=40),
 GradientBoostingClassifier(random_state=40))

#### Saving the model

In [47]:
import pickle

In [48]:
downy_clf, powdery_clf, anthra_clf = get_report(GradientBoostingClassifier)


    Downy Mildew:
        Accuracy: 0.6129
        F1 Score: 0.6046
        Precision: 0.6097
        Recall: 0.6129
    

    Powdery Mildew:
        Accuracy: 0.4839
        F1 Score: 0.4790
        Precision: 0.4753
        Recall: 0.4839
    

    Anthracnose:
        Accuracy: 0.7419
        F1 Score: 0.7372
        Precision: 0.7350
        Recall: 0.7419
    


In [49]:
pickle.dump(downy_clf, open('../models/downy_mildew_classifier.sav', 'wb'))
pickle.dump(powdery_clf, open('../models/powdery_mildew_classifier.sav', 'wb'))
pickle.dump(anthra_clf, open('../models/anthracnose_classifier.sav', 'wb'))