In [67]:
import pandas as pd
import seaborn as sns
from datetime import timedelta
from datetime import datetime

In [68]:
disease_df = pd.read_csv('disease_data.csv', index_col='Date')
weather_df = pd.read_csv('weather_data.csv', index_col='datetime')

In [69]:
weather_df.columns

Index(['name', 'tempmax', 'tempmin', 'temp', 'feelslikemax', 'feelslikemin',
       'feelslike', 'dew', 'humidity', 'precip', 'precipprob', 'precipcover',
       'preciptype', 'snow', 'snowdepth', 'windgust', 'windspeed', 'winddir',
       'sealevelpressure', 'cloudcover', 'visibility', 'solarradiation',
       'solarenergy', 'uvindex', 'severerisk', 'sunrise', 'sunset',
       'moonphase', 'conditions', 'description', 'icon', 'stations'],
      dtype='object')

In [70]:
selected_cols = ['temp', 'dew', 'humidity', 'precip']

In [71]:
reduced_weather_df = weather_df[selected_cols]

In [72]:
column_names = list(disease_df.columns)
for i in range(7):
    for col in reduced_weather_df.columns:
        column_names.append(f"day_{i}_{col}")
column_names.append('days_after_pruning')

In [73]:
data = []

In [74]:
def generate_week(date_str):
    date = datetime.strptime(date_str, "%Y-%m-%d")
    week = []
    for i in range(7):
        day = date - timedelta(days=i)
        week.append(day.strftime("%Y-%m-%d"))
    week.reverse()
    return week


In [75]:
for date in disease_df.index:
    try:
        drow = list(disease_df.loc[date])
        week = generate_week(date)
        for i in range(len(week)):
            row = reduced_weather_df.loc[week[i]]
            drow.extend(row)
        cur_date = datetime.strptime(date, '%Y-%m-%d')
        pruning_date = datetime.strptime(f'{cur_date.year}-09-25', '%Y-%m-%d')
        if pruning_date > cur_date:
            pruning_date = datetime.strptime(f'{cur_date.year - 1}-09-25', '%Y-%m-%d')
        diff = cur_date - pruning_date
        drow.append(diff.days)
        data.append(drow)
    except:
        pass

In [76]:
data_df = pd.DataFrame(data, columns=column_names)

In [78]:
data_df = data_df.drop(['Rust', 'Bacterial Spot', 'Bunch Rot'], axis=1)

In [79]:
data_df = data_df.dropna(subset=['Downy Mildew'])

In [83]:
data_df

Unnamed: 0,Downy Mildew,Powdery Mildew,Anthracnose,day_0_temp,day_0_dew,day_0_humidity,day_0_precip,day_1_temp,day_1_dew,day_1_humidity,...,day_4_precip,day_5_temp,day_5_dew,day_5_humidity,day_5_precip,day_6_temp,day_6_dew,day_6_humidity,day_6_precip,days_after_pruning
0,moderate,low,moderate,23.9,8.7,40.1,0.0,22.9,5.8,34.5,...,0.0,23.6,9.5,42.7,0.0,23.3,10.4,46.3,0.0,94
1,moderate,low,moderate,21.9,10.8,50.2,0.0,22.3,13.6,58.4,...,0.0,22.4,11.1,49.8,0.0,23.2,11.9,50.0,0.0,87
2,moderate,low,moderate,23.4,16.4,66.2,0.0,24.4,16.2,61.8,...,0.0,23.5,12.2,51.0,0.0,22.3,11.7,52.2,0.0,80
3,moderate,low,moderate,25.4,18.3,66.8,1.1,25.6,16.8,59.3,...,0.0,23.8,17.8,70.3,0.1,22.9,16.9,70.7,0.0,73
4,moderate,low,moderate,25.7,18.8,66.7,0.1,25.6,19.3,69.3,...,0.2,24.5,18.8,72.7,1.9,25.9,18.9,66.5,5.0,66
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
267,low,moderate,nil,26.8,16.8,56.1,0.0,26.8,15.5,51.0,...,0.0,23.6,11.2,48.0,0.0,24.4,11.0,45.4,0.0,51
270,low,nil,nil,25.7,20.6,75.8,8.0,26.0,18.0,64.6,...,0.0,26.7,19.6,66.8,0.3,26.0,19.9,70.8,0.5,30
272,low,nil,nil,27.2,19.9,65.7,0.0,27.6,18.0,57.7,...,0.0,26.5,20.5,73.2,0.0,27.6,20.1,68.9,0.0,16
280,moderate,moderate,high,25.1,21.7,81.9,0.2,24.4,21.6,84.9,...,6.0,24.1,21.7,86.5,5.0,23.0,21.5,91.5,1.0,325


In [84]:
data_df.to_csv('processed_data.csv', index=False)

### Dataset loading

In [16]:
import pandas as pd
import seaborn as sns

In [17]:
data_df = pd.read_csv('processed_data.csv')

### Train Test Split

In [18]:
X = data_df.drop(['Downy Mildew', 'Anthracnose', 'Powdery Mildew'], axis=1)
y1 = data_df['Downy Mildew']
y2 = data_df['Powdery Mildew']
y3 = data_df['Anthracnose']

In [19]:
from sklearn.model_selection import train_test_split

In [20]:
X_train, X_test, y1_train, y1_test, y2_train, y2_test, y3_train, y3_test = train_test_split(X, y1, y2, y3, random_state=40)

### Model Training

In [21]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score

#### Random Forest Classifier

In [22]:
downy_clf = RandomForestClassifier(random_state=40)
powdery_clf = RandomForestClassifier(random_state=40)
anthra_clf = RandomForestClassifier(random_state=40)

In [23]:
downy_clf.fit(X_train, y1_train)
powdery_clf.fit(X_train, y2_train)
anthra_clf.fit(X_train, y3_train)

In [24]:
y1_pred = downy_clf.predict(X_test)
y2_pred = powdery_clf.predict(X_test)
y3_pred = anthra_clf.predict(X_test)

In [25]:
print(f"""
Downy Mildew:
    Accuracy: {accuracy_score(y1_test, y1_pred)}
    F1 Score: {f1_score(y1_test, y1_pred, average='micro')}
    Precision: {precision_score(y1_test, y1_pred, average='micro')}
    Recall: {recall_score(y1_test, y1_pred, average='micro')}
""")
print(f"""
Powdery Mildew:
    Accuracy: {accuracy_score(y2_test, y2_pred)}
    F1 Score: {f1_score(y2_test, y2_pred, average='micro')}
    Precision: {precision_score(y2_test, y2_pred, average='micro')}
    Recall: {recall_score(y2_test, y2_pred, average='micro')}
""")
print(f"""
Anthracnose:
    Accuracy: {accuracy_score(y3_test, y3_pred)}
    F1 Score: {f1_score(y3_test, y3_pred, average='micro')}
    Precision: {precision_score(y3_test, y3_pred, average='micro')}
    Recall: {recall_score(y3_test, y3_pred, average='micro')}
""")


Downy Mildew:
    Accuracy: 0.6206896551724138
    F1 Score: 0.6206896551724138
    Precision: 0.6206896551724138
    Recall: 0.6206896551724138


Powdery Mildew:
    Accuracy: 0.5862068965517241
    F1 Score: 0.5862068965517241
    Precision: 0.5862068965517241
    Recall: 0.5862068965517241


Anthracnose:
    Accuracy: 0.6551724137931034
    F1 Score: 0.6551724137931034
    Precision: 0.6551724137931034
    Recall: 0.6551724137931034



#### Decision Tree Classifier

In [26]:
downy_clf = DecisionTreeClassifier(random_state=40)
powdery_clf = DecisionTreeClassifier(random_state=40)
anthra_clf = DecisionTreeClassifier(random_state=40)

In [27]:
downy_clf.fit(X_train, y1_train)
powdery_clf.fit(X_train, y2_train)
anthra_clf.fit(X_train, y3_train)

In [28]:
y1_pred = downy_clf.predict(X_test)
y2_pred = powdery_clf.predict(X_test)
y3_pred = anthra_clf.predict(X_test)

In [29]:
print(f"""
Downy Mildew:
    Accuracy: {accuracy_score(y1_test, y1_pred)}
    F1 Score: {f1_score(y1_test, y1_pred, average='micro')}
    Precision: {precision_score(y1_test, y1_pred, average='micro')}
    Recall: {recall_score(y1_test, y1_pred, average='micro')}
""")
print(f"""
Powdery Mildew:
    Accuracy: {accuracy_score(y2_test, y2_pred)}
    F1 Score: {f1_score(y2_test, y2_pred, average='micro')}
    Precision: {precision_score(y2_test, y2_pred, average='micro')}
    Recall: {recall_score(y2_test, y2_pred, average='micro')}
""")
print(f"""
Anthracnose:
    Accuracy: {accuracy_score(y3_test, y3_pred)}
    F1 Score: {f1_score(y3_test, y3_pred, average='micro')}
    Precision: {precision_score(y3_test, y3_pred, average='micro')}
    Recall: {recall_score(y3_test, y3_pred, average='micro')}
""")


Downy Mildew:
    Accuracy: 0.6896551724137931
    F1 Score: 0.6896551724137931
    Precision: 0.6896551724137931
    Recall: 0.6896551724137931


Powdery Mildew:
    Accuracy: 0.5172413793103449
    F1 Score: 0.5172413793103449
    Precision: 0.5172413793103449
    Recall: 0.5172413793103449


Anthracnose:
    Accuracy: 0.5862068965517241
    F1 Score: 0.5862068965517241
    Precision: 0.5862068965517241
    Recall: 0.5862068965517241



#### Support Vector Classifier

In [30]:
downy_clf = SVC(random_state=40)
powdery_clf = SVC(random_state=40)
anthra_clf = SVC(random_state=40)

In [31]:
downy_clf.fit(X_train, y1_train)
powdery_clf.fit(X_train, y2_train)
anthra_clf.fit(X_train, y3_train)

In [32]:
y1_pred = downy_clf.predict(X_test)
y2_pred = powdery_clf.predict(X_test)
y3_pred = anthra_clf.predict(X_test)

In [33]:
print(f"""
Downy Mildew:
    Accuracy: {accuracy_score(y1_test, y1_pred)}
    F1 Score: {f1_score(y1_test, y1_pred, average='micro')}
    Precision: {precision_score(y1_test, y1_pred, average='micro')}
    Recall: {recall_score(y1_test, y1_pred, average='micro')}
""")
print(f"""
Powdery Mildew:
    Accuracy: {accuracy_score(y2_test, y2_pred)}
    F1 Score: {f1_score(y2_test, y2_pred, average='micro')}
    Precision: {precision_score(y2_test, y2_pred, average='micro')}
    Recall: {recall_score(y2_test, y2_pred, average='micro')}
""")
print(f"""
Anthracnose:
    Accuracy: {accuracy_score(y3_test, y3_pred)}
    F1 Score: {f1_score(y3_test, y3_pred, average='micro')}
    Precision: {precision_score(y3_test, y3_pred, average='micro')}
    Recall: {recall_score(y3_test, y3_pred, average='micro')}
""")


Downy Mildew:
    Accuracy: 0.5344827586206896
    F1 Score: 0.5344827586206896
    Precision: 0.5344827586206896
    Recall: 0.5344827586206896


Powdery Mildew:
    Accuracy: 0.3448275862068966
    F1 Score: 0.3448275862068966
    Precision: 0.3448275862068966
    Recall: 0.3448275862068966


Anthracnose:
    Accuracy: 0.6724137931034483
    F1 Score: 0.6724137931034483
    Precision: 0.6724137931034483
    Recall: 0.6724137931034483



#### Logistic Regression

In [34]:
downy_clf = LogisticRegression(random_state=40)
powdery_clf = LogisticRegression(random_state=40)
anthra_clf = LogisticRegression(random_state=40)

In [35]:
downy_clf.fit(X_train, y1_train)
powdery_clf.fit(X_train, y2_train)
anthra_clf.fit(X_train, y3_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

In [36]:
y1_pred = downy_clf.predict(X_test)
y2_pred = powdery_clf.predict(X_test)
y3_pred = anthra_clf.predict(X_test)

In [37]:
print(f"""
Downy Mildew:
    Accuracy: {accuracy_score(y1_test, y1_pred)}
    F1 Score: {f1_score(y1_test, y1_pred, average='micro')}
    Precision: {precision_score(y1_test, y1_pred, average='micro')}
    Recall: {recall_score(y1_test, y1_pred, average='micro')}
""")
print(f"""
Powdery Mildew:
    Accuracy: {accuracy_score(y2_test, y2_pred)}
    F1 Score: {f1_score(y2_test, y2_pred, average='micro')}
    Precision: {precision_score(y2_test, y2_pred, average='micro')}
    Recall: {recall_score(y2_test, y2_pred, average='micro')}
""")
print(f"""
Anthracnose:
    Accuracy: {accuracy_score(y3_test, y3_pred)}
    F1 Score: {f1_score(y3_test, y3_pred, average='micro')}
    Precision: {precision_score(y3_test, y3_pred, average='micro')}
    Recall: {recall_score(y3_test, y3_pred, average='micro')}
""")


Downy Mildew:
    Accuracy: 0.46551724137931033
    F1 Score: 0.46551724137931033
    Precision: 0.46551724137931033
    Recall: 0.46551724137931033


Powdery Mildew:
    Accuracy: 0.39655172413793105
    F1 Score: 0.39655172413793105
    Precision: 0.39655172413793105
    Recall: 0.39655172413793105


Anthracnose:
    Accuracy: 0.5344827586206896
    F1 Score: 0.5344827586206896
    Precision: 0.5344827586206896
    Recall: 0.5344827586206896



#### Generic Classification Evaluator

In [38]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import GradientBoostingClassifier

In [39]:
from sklearn.preprocessing import LabelEncoder
encoder = LabelEncoder()
encoder.fit(y1)

In [40]:
encoder.transform(y1_test)

array([0, 0, 1, 4, 3, 4, 4, 4, 3, 1, 1, 3, 4, 1, 4, 3, 4, 1, 3, 4, 1, 1,
       3, 3, 1, 0, 4, 0, 1, 1, 4, 4, 4, 1, 4, 4, 3, 3, 4, 3, 1, 0, 1, 1,
       4, 3, 3, 4, 4, 4, 3, 1, 1, 1, 4, 3, 4, 0])

In [41]:
def get_report(classifier_algorithm, use_random_state=True):

    # Instantiate the models
    if use_random_state:
        downy_clf = classifier_algorithm(random_state=40)
        powdery_clf = classifier_algorithm(random_state=40)
        anthra_clf = classifier_algorithm(random_state=40)
    else:
        downy_clf = classifier_algorithm()
        powdery_clf = classifier_algorithm()
        anthra_clf = classifier_algorithm()

    # Train the models
    downy_clf.fit(X_train, y1_train)
    powdery_clf.fit(X_train, y2_train)
    anthra_clf.fit(X_train, y3_train)

    # Make Predictions
    y1_pred = downy_clf.predict(X_test)
    y2_pred = powdery_clf.predict(X_test)
    y3_pred = anthra_clf.predict(X_test)

    # Evaluate predictions
    print(f"""
    Downy Mildew:
        Accuracy: {accuracy_score(y1_test, y1_pred):.4f}
        F1 Score: {f1_score(y1_test, y1_pred, average='weighted'):.4f}
        Precision: {precision_score(y1_test, y1_pred, average='weighted'):.4f}
        Recall: {recall_score(y1_test, y1_pred, average='weighted'):.4f}
    """)
    print(f"""
    Powdery Mildew:
        Accuracy: {accuracy_score(y2_test, y2_pred):.4f}
        F1 Score: {f1_score(y2_test, y2_pred, average='weighted'):.4f}
        Precision: {precision_score(y2_test, y2_pred, average='weighted'):.4f}
        Recall: {recall_score(y2_test, y2_pred, average='weighted'):.4f}
    """)
    print(f"""
    Anthracnose:
        Accuracy: {accuracy_score(y3_test, y3_pred):.4f}
        F1 Score: {f1_score(y3_test, y3_pred, average='weighted'):.4f}
        Precision: {precision_score(y3_test, y3_pred, average='weighted'):.4f}
        Recall: {recall_score(y3_test, y3_pred, average='weighted'):.4f}
    """)
    return downy_clf, powdery_clf, anthra_clf

In [42]:
get_report(LogisticRegression)


    Downy Mildew:
        Accuracy: 0.4655
        F1 Score: 0.4354
        Precision: 0.4155
        Recall: 0.4655
    

    Powdery Mildew:
        Accuracy: 0.3966
        F1 Score: 0.4091
        Precision: 0.5051
        Recall: 0.3966
    

    Anthracnose:
        Accuracy: 0.5345
        F1 Score: 0.4817
        Precision: 0.4455
        Recall: 0.5345
    


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

(LogisticRegression(random_state=40),
 LogisticRegression(random_state=40),
 LogisticRegression(random_state=40))

In [43]:
get_report(DecisionTreeClassifier)


    Downy Mildew:
        Accuracy: 0.6897
        F1 Score: 0.6721
        Precision: 0.7280
        Recall: 0.6897
    

    Powdery Mildew:
        Accuracy: 0.5172
        F1 Score: 0.5265
        Precision: 0.5532
        Recall: 0.5172
    

    Anthracnose:
        Accuracy: 0.5862
        F1 Score: 0.5905
        Precision: 0.6037
        Recall: 0.5862
    


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


(DecisionTreeClassifier(random_state=40),
 DecisionTreeClassifier(random_state=40),
 DecisionTreeClassifier(random_state=40))

In [44]:
get_report(RandomForestClassifier)


    Downy Mildew:
        Accuracy: 0.6207
        F1 Score: 0.5724
        Precision: 0.5388
        Recall: 0.6207
    

    Powdery Mildew:
        Accuracy: 0.5862
        F1 Score: 0.5906
        Precision: 0.6377
        Recall: 0.5862
    

    Anthracnose:
        Accuracy: 0.6552
        F1 Score: 0.6385
        Precision: 0.6395
        Recall: 0.6552
    


  _warn_prf(average, modifier, msg_start, len(result))


(RandomForestClassifier(random_state=40),
 RandomForestClassifier(random_state=40),
 RandomForestClassifier(random_state=40))

In [45]:
get_report(SVC)


    Downy Mildew:
        Accuracy: 0.5345
        F1 Score: 0.5000
        Precision: 0.4742
        Recall: 0.5345
    

    Powdery Mildew:
        Accuracy: 0.3448
        F1 Score: 0.3426
        Precision: 0.3543
        Recall: 0.3448
    

    Anthracnose:
        Accuracy: 0.6724
        F1 Score: 0.5946
        Precision: 0.5363
        Recall: 0.6724
    


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


(SVC(random_state=40), SVC(random_state=40), SVC(random_state=40))

In [46]:
get_report(KNeighborsClassifier, use_random_state=False)


    Downy Mildew:
        Accuracy: 0.6034
        F1 Score: 0.5762
        Precision: 0.6220
        Recall: 0.6034
    

    Powdery Mildew:
        Accuracy: 0.5000
        F1 Score: 0.4868
        Precision: 0.5325
        Recall: 0.5000
    

    Anthracnose:
        Accuracy: 0.6034
        F1 Score: 0.5845
        Precision: 0.5843
        Recall: 0.6034
    


(KNeighborsClassifier(), KNeighborsClassifier(), KNeighborsClassifier())

In [47]:
get_report(GradientBoostingClassifier)


    Downy Mildew:
        Accuracy: 0.7069
        F1 Score: 0.6543
        Precision: 0.6236
        Recall: 0.7069
    

    Powdery Mildew:
        Accuracy: 0.6034
        F1 Score: 0.5989
        Precision: 0.6109
        Recall: 0.6034
    

    Anthracnose:
        Accuracy: 0.7414
        F1 Score: 0.7320
        Precision: 0.7717
        Recall: 0.7414
    


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


(GradientBoostingClassifier(random_state=40),
 GradientBoostingClassifier(random_state=40),
 GradientBoostingClassifier(random_state=40))

#### Saving the model

In [48]:
import pickle

In [49]:
downy_clf, powdery_clf, anthra_clf = get_report(GradientBoostingClassifier)


    Downy Mildew:
        Accuracy: 0.7069
        F1 Score: 0.6543
        Precision: 0.6236
        Recall: 0.7069
    

    Powdery Mildew:
        Accuracy: 0.6034
        F1 Score: 0.5989
        Precision: 0.6109
        Recall: 0.6034
    

    Anthracnose:
        Accuracy: 0.7414
        F1 Score: 0.7320
        Precision: 0.7717
        Recall: 0.7414
    


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [50]:
pickle.dump(downy_clf, open('../models/downy_mildew_classifier.sav', 'wb'))
pickle.dump(powdery_clf, open('../models/powdery_mildew_classifier.sav', 'wb'))
pickle.dump(anthra_clf, open('../models/anthracnose_classifier.sav', 'wb'))