In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix
from termcolor import colored
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
from sklearn.ensemble import RandomForestClassifier
import joblib
from sklearn.preprocessing import MinMaxScaler,StandardScaler
from imblearn.under_sampling import RandomUnderSampler


In [None]:
# A decision tree does not need transformed (scaled) features, because rescaling does not matter for its split boundaries.

In [None]:
# Load the weather dataset; the path is relative to the notebook's working directory.
# NOTE(review): the file name suggests it was already cleaned in an earlier step - confirm.
df = pd.read_csv('../../dataset/weatherClean.csv')

In [None]:
# Summary statistics; include='all' makes describe() cover every column, not only the numeric ones.
df.describe(include='all')


In [None]:
# (rows, columns) of the loaded frame.
df.shape

In [None]:
# Per-column flag: True if the column contains at least one missing value.
df.isna().any()

In [None]:
# Separate the features from the target column 'RainTomorrow'.
X = df.drop(columns='RainTomorrow')
Y = df['RainTomorrow']

# Column labels are kept for labelling the tree plot later on.
feature_names = X.columns
X.head()

In [None]:
# (rows, columns) of the feature matrix, i.e. the frame without the target column.
X.shape


In [None]:
# The target classes are not balanced, so the split is stratified on Y
# to keep the same class proportions in both parts.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.20,
    stratify=Y,
    random_state=41,
)

# Show the size of the training part, then of the test part.
for split in (X_train, X_test):
    print(split.shape)

Outlier

In [None]:
def col_plot(df, col_name):
    """Show a histogram and a box plot of one column for visual outlier checks.

    The histogram (20 bins) is annotated with two red vertical lines at
    mean + 3*std and mean - 3*std of the column. The box plot uses
    matplotlib's default whiskers (1.5 * IQR), hence its "IQR=1.5" label.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame that contains the column to plot.
    col_name : str
        Label of the (numeric) column in ``df``.

    Returns
    -------
    None
        The figure is displayed with ``plt.show()``.
    """
    values = df[col_name]
    mean, std = values.mean(), values.std()

    plt.figure(figsize=(20, 6))

    # Left panel: distribution with the 3-sigma limits.
    plt.subplot(141)
    plt.hist(values, bins=20)
    plt.axvline(x=mean + 3 * std, color='red')
    plt.axvline(x=mean - 3 * std, color='red')
    # The x axis carries the column's values and the y axis the bin counts;
    # previously a second xlabel call overwrote the column name.
    plt.title("Histogram")
    plt.xlabel(col_name)
    plt.ylabel("Count")

    # Right panel: box plot of the same column.
    plt.subplot(142)
    plt.boxplot(values)
    plt.xlabel("IQR=1.5")

    # tight_layout must be *called*; the bare attribute access did nothing.
    plt.tight_layout()
    plt.show()


In [None]:
# Draw the diagnostic figure for every feature column, one figure per column.
for column in X.columns:
    col_plot(X, column)

In [None]:
def find_boundaries(df, name, whisker=1.5):
    """Return the lower and upper IQR fences of a column.

    The fences are ``Q1 - whisker * IQR`` and ``Q3 + whisker * IQR`` where
    Q1/Q3 are the 25th/75th percentiles of ``df[name]`` and IQR = Q3 - Q1.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame that contains the column.
    name : str
        Label of the numeric column to analyse.
    whisker : float, default 1.5
        Multiplier applied to the IQR. The default reproduces the previously
        hard-coded 1.5 rule; a larger value gives wider fences.

    Returns
    -------
    tuple
        ``(lower_boundary, upper_boundary)``.
    """
    # One quantile call yields both quartiles, in the order requested.
    q1, q3 = df[name].quantile([0.25, 0.75])
    iqr = q3 - q1
    return q1 - whisker * iqr, q3 + whisker * iqr


In [None]:
def outliers(name, df):
    """Flag the rows of ``df`` whose ``name`` value lies outside the IQR fences.

    The fences come from ``find_boundaries(df, name)``. Both fences and the
    resulting mask are printed.

    Parameters
    ----------
    name : str
        Label of the column to check.
    df : pandas.DataFrame
        Frame that contains the column.

    Returns
    -------
    tuple
        ``(count, mask)``: the number of flagged rows and a boolean NumPy
        array with one entry per row of ``df``, True where the value is
        above the upper fence or below the lower one.
    """
    low, high = find_boundaries(df, name)
    print('Gornja granica', high)
    print('Donja granica', low)

    column = df[name]
    # Plain NumPy array so callers can index positionally with ~mask.
    mask = ((column > high) | (column < low)).to_numpy()

    print(mask)
    return int(mask.sum()), mask



In [None]:
# Remove IQR outliers from the training split only, one column after another:
# the fences for the second column are computed on the rows that survived the
# first filter. The test split is left untouched.
for column in ('Evaporation', 'WindSpeed9am'):
    out = outliers(column, X_train)[1]
    X_train = X_train[~out]
    Y_train = Y_train[~out]
    # Report the rows that remain in the training set. The previous
    # print(len(df)) always showed the size of the full, unfiltered frame.
    print(len(X_train))

In [None]:
# Size of the training features after the outlier rows were dropped.
X_train.shape

In [None]:
# Standardise the features. The scaler learns its statistics from the training
# split only and the same transformation is then applied to both splits.
scaler = StandardScaler().fit(X_train)
X_train, X_test = scaler.transform(X_train), scaler.transform(X_test)

In [None]:
# Histogram of the training labels (after outlier removal) to inspect the class balance.
Y_train.hist(xlabelsize=15,bins=3,legend=True)
        

Model

In [None]:
# Code adapted from the course exercises
def report(model, x, y, text = "training"):
    """Print evaluation metrics for a fitted classifier and plot its confusion matrix.

    Prints scikit-learn's classification report and the confusion matrix as
    text, then draws the confusion matrix as a seaborn heatmap on a new figure.

    Parameters
    ----------
    model : fitted estimator
        Object with a ``predict`` method.
    x : array-like
        Features to predict on.
    y : array-like
        True labels for ``x``. The hard-coded 'No'/'Yes' labels assume a
        binary target.
    text : str, default "training"
        Name of the data set, used only in the printed headings.

    Returns
    -------
    None
    """
    y_pred = model.predict(x)
    model_name = type(model).__name__
    separator = "---------------------------------------------------------------------------------"

    print(colored("Classification report for model {} on {} data".format(model_name, text), "green"))
    print(separator)
    print(classification_report(y, y_pred, zero_division=True))
    print(separator)

    # Computed once and reused for both the text table and the heatmap.
    cm = confusion_matrix(y, y_pred)

    print(colored("Confusion matrix for model {} on {} data ".format(model_name, text), "green"))
    print(separator)
    print(pd.DataFrame(cm, columns=['No', 'Yes'], index=['No', 'Yes']))
    print(separator)

    # scikit-learn puts the true classes on the rows and the predicted classes
    # on the columns, so "Actual" labels the index and "Predict" the columns
    # (they were swapped before).
    cm_matrix = pd.DataFrame(cm,
                             index=['Actual Negative:0', 'Actual Positive:1'],
                             columns=['Predict Negative:0', 'Predict Positive:1'])

    # A fresh figure per call keeps successive reports from drawing on top of
    # each other when several are produced in one cell.
    plt.figure()
    sns.heatmap(cm_matrix, annot=True, fmt='d', cmap='YlGnBu')
    plt.show()
    

def plot_decision_tree(model, feature_names):
    """Draw a fitted decision tree and print its hyper-parameters.

    Parameters
    ----------
    model : fitted decision tree estimator
        Must provide ``get_depth()``, ``tree_`` and ``get_params()``.
    feature_names : sequence of str
        Feature labels, in the column order used for fitting.

    Returns
    -------
    None
    """
    separator = "---------------------------------------------------------------------------------"

    plt.figure(figsize=(7, 7))
    # plot_tree expects class names in ascending order of the class labels.
    # 'No' comes first, matching the label order used in report().
    # NOTE(review): assumes the target sorts as No < Yes (e.g. 0/1) - confirm.
    plot_tree(model, class_names=['No', 'Yes'], feature_names=feature_names, filled=True, fontsize=10)
    # tree_.node_count counts every node; get_n_leaves() (used before) counts
    # only the leaves and therefore contradicted the word "nodes" in the title.
    plt.title("Decision tree of depth {} with {} nodes".format(model.get_depth(), model.tree_.node_count))

    plt.show()
    print(separator)
    print(colored("Parameters of model {}".format(type(model).__name__), "green"))
    for key, value in model.get_params().items():
        print(colored(key, 'blue'), value)

    print(separator)


Model stabla bez podešavanja hiperparametara

In [None]:
# Baseline tree with default hyper-parameters. class_weight='balanced'
# re-weights the classes (author's note: without balancing, acc = 0.64).
# random_state is fixed so that re-running the notebook builds the same tree,
# in line with the seeded grid search and random forest below.
tree = DecisionTreeClassifier(class_weight='balanced', random_state=42)

In [None]:
tree.fit(X_train, Y_train) # train the model
# Depth of the fitted tree, shown as the cell output.
tree.get_depth()

In [None]:
# Metrics of the baseline tree on the data it was trained on.
report(tree, X_train, Y_train)

In [None]:
# Horizontal bar chart of the baseline tree's feature importances,
# labelled with the original column names.
tree_importances = pd.Series(
    tree.feature_importances_,
    index=X.columns,
)
tree_importances.plot.barh()
plt.title("Feature importance")
plt.show()

In [None]:
# Metrics of the baseline tree on the held-out test split.
report(tree, X_test, Y_test,text = "test")


In [None]:
def imbalanced_fit(X_train, y_train, X_test, y_test, sampler):
    """Resample the training data, fit a decision tree on it and report on the test data.

    Parameters
    ----------
    X_train, y_train : array-like
        Training features and labels passed to ``sampler.fit_resample``.
    X_test, y_test : array-like
        Data the fitted model is evaluated on via ``report``.
    sampler : object
        Resampler exposing ``fit_resample(X, y)``.

    Returns
    -------
    DecisionTreeClassifier
        The tree fitted on the resampled training data.
    """
    X_resampled, y_resampled = sampler.fit_resample(X_train, y_train)
    print(f'X_resampled.shape={X_resampled.shape}')
    # Wrapping in a Series keeps this working when the sampler returns a NumPy
    # array, which has no value_counts(); a Series input prints as before.
    print(f'y value counts = {pd.Series(y_resampled).value_counts()}')

    model = DecisionTreeClassifier()
    model.fit(X_resampled, y_resampled)

    # The model is evaluated on the test arguments, so the headings must say
    # "test"; the default label of report() is "training".
    report(model, X_test, y_test, text="test")

    return model

In [None]:
# Fit a tree on a randomly under-sampled training set. The sampler is seeded
# so that the same rows are selected on every run.
tree_under = imbalanced_fit(X_train, Y_train, X_test, Y_test, RandomUnderSampler(random_state=42))



Podešavanje hiperparametara

In [None]:
# Hyper-parameter grid for the decision tree search: 2 * 3 * 3 * 3 = 54
# combinations. random_state has a single candidate, which pins the seed of
# every tree built during the search.
params = dict(
    criterion=['gini', 'entropy'],
    max_depth=[3, 7, 11],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 3, 5],
    random_state=[42],
)

In [None]:
# Exhaustive search over `params` with 3-fold cross-validation. No `scoring`
# argument is given, so candidates are ranked by the estimator's default score.
estimator = GridSearchCV(
    estimator=DecisionTreeClassifier(class_weight='balanced'),
    param_grid=params,
    cv=3,
)

In [None]:
# Run the grid search on the training data.
estimator.fit(X_train, Y_train)


In [None]:
# Parameter combination that achieved the best cross-validated score.
estimator.best_params_


In [None]:
# Mean cross-validated score of the best parameter combination.
estimator.best_score_


In [None]:
# Metrics of the tuned tree on the training data.
report(estimator.best_estimator_, X_train, Y_train)


In [None]:
# Metrics of the tuned tree on the held-out test split.
report(estimator.best_estimator_, X_test, Y_test, "test")


In [None]:
# Horizontal bar chart of the tuned tree's feature importances,
# labelled with the original column names.
best_importances = pd.Series(
    estimator.best_estimator_.feature_importances_,
    index=X.columns,
)
best_importances.plot.barh()
plt.title("Feature importance")
plt.show()

In [None]:
# Visualise the tuned tree and list its hyper-parameters.
plot_decision_tree(estimator.best_estimator_, feature_names)


Slučajne šume

In [None]:
# Random forest with default hyper-parameters, a fixed seed and balanced class weights.
Forest_model = RandomForestClassifier(random_state=42,class_weight='balanced')


In [None]:
#param_grid = {
#    'n_estimators': [100, 200, 300],
#    'max_depth': [None, 5, 10],
#'min_samples_split': [2, 5, 10],
#    'min_samples_leaf': [1, 2, 4]
#}

In [None]:
#grid_search_RF = GridSearchCV(Forest_model, param_grid, scoring='accuracy', cv=5)


In [None]:
# Train the forest directly; the grid-search variant below is disabled.
Forest_model.fit(X_train, Y_train)
#grid_search_RF.fit(X_train,Y_train)

In [None]:
# Metrics of the random forest on the held-out test split.
report(Forest_model, X_test, Y_test, "test")


In [None]:
from sklearn.metrics import roc_curve, roc_auc_score

# Compare the three fitted classifiers on the test split with ROC curves.
models = [tree, estimator.best_estimator_, Forest_model]
model_names = ['DecisionTree', 'GridSearchCV', 'RandomForest']
for model, model_name in zip(models, model_names):
    # A ROC curve is traced by sweeping a threshold over a continuous score.
    # Hard predict() labels give a single operating point, so the predicted
    # probability of the second (greater-labelled) class is used instead.
    y_score = model.predict_proba(X_test)[:, 1]
    fpr, tpr, _ = roc_curve(Y_test, y_score)
    auc = roc_auc_score(Y_test, y_score)
    lab = model_name + "(auc: " + str(round(auc, 3)) + ")"
    plt.plot(fpr, tpr, label=lab)

# Diagonal reference line of a random classifier.
plt.plot([0, 1], [0, 1], label='Random (auc: 0.5)', color='red')
plt.title("Poređenje modela")
plt.xlabel("False positive rate")
plt.ylabel("True positive rate")
plt.legend()
plt.show()


In [None]:
# TODO: accuracy is poor - needs to be improved

In [None]:
# Persist the baseline decision tree (`tree`) to disk with joblib.
# NOTE(review): the file name says "RandomTree" although the object saved is the
# DecisionTreeClassifier held in `tree` - confirm the intended name.
with open('RandomTreeModel.pkl', 'wb') as file:  
    joblib.dump(tree, file)

# Read the file back as a round-trip check. `loaded_model` is not used
# afterwards in the cells visible here.
with open('RandomTreeModel.pkl', 'rb') as file:
    # Deserialize the model from the open file (only load files you trust)
    loaded_model = joblib.load(file)


In [None]:
# Persist the tree selected by the grid search to disk with joblib.
with open('BestEstimator.pkl', 'wb') as file:  
    joblib.dump(estimator.best_estimator_, file)

# Read the file back as a round-trip check; this rebinds `loaded_model`.
with open('BestEstimator.pkl', 'rb') as file:
    # Deserialize the model from the open file (only load files you trust)
    loaded_model = joblib.load(file)

In [None]:
# Persist the random forest to disk with joblib.
# NOTE(review): 'Forrest.pkl' looks like a misspelling of "Forest"; it is left
# unchanged because other code may read the file under this name.
with open('Forrest.pkl', 'wb') as file:  
    joblib.dump(Forest_model, file)

# Read the file back as a round-trip check; this rebinds `loaded_model`.
with open('Forrest.pkl', 'rb') as file:
    # Deserialize the model from the open file (only load files you trust)
    loaded_model = joblib.load(file)