In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from random import randint
%matplotlib inline
import warnings
warnings.filterwarnings("ignore")

from sklearn.model_selection import train_test_split
def split(df,label):
    X_tr, X_te, Y_tr, Y_te = train_test_split(df, label, test_size=0.25, random_state=42)
    return X_tr, X_te, Y_tr, Y_te

from sklearn import svm
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn import metrics
from sklearn.metrics import accuracy_score
from sklearn.model_selection import KFold, cross_val_score

In [None]:
classifiers = ['LinearSVM', 'RadialSVM',
               'Logistic',  'RandomForest',
               'AdaBoost',  'DecisionTree',
               'KNeighbors','GradientBoosting']

models = [svm.SVC(kernel='linear'),
          svm.SVC(kernel='rbf'),
          LogisticRegression(max_iter = 1000),
          RandomForestClassifier(n_estimators=200, random_state=0),
          AdaBoostClassifier(random_state = 0),
          DecisionTreeClassifier(random_state=0),
          KNeighborsClassifier(),
          GradientBoostingClassifier(random_state=0)]


In [None]:
def acc_score(df,label):
    Score = pd.DataFrame({"Classifier":classifiers})
    j = 0
    acc = []
    X_train,X_test,Y_train,Y_test = split(df,label)
    for i in models:
        model = i
        model.fit(X_train,Y_train)
        predictions = model.predict(X_test)
        acc.append(accuracy_score(Y_test,predictions))
        j = j+1
    Score["Accuracy"] = acc
    Score.sort_values(by="Accuracy", ascending=False,inplace = True)
    Score.reset_index(drop=True, inplace=True)
    return Score

In [None]:
def plot(score,x,y,c = "b"):
    gen = [1,2,3,4,5]
    plt.figure(figsize=(6,4))
    ax = sns.pointplot(x=gen, y=score,color = c )
    ax.set(xlabel="Generation", ylabel="Accuracy")
    ax.set(ylim=(x,y))

In [None]:
def initilization_of_population(size,n_feat):
    population = []
    for i in range(size):
        chromosome = np.ones(n_feat,dtype=np.bool)
        chromosome[:int(0.3*n_feat)]=False
        np.random.shuffle(chromosome)
        population.append(chromosome)
    return population

In [None]:
def selection(pop_after_fit,n_parents):
    population_nextgen = []
    for i in range(n_parents):
        population_nextgen.append(pop_after_fit[i])
    return population_nextgen


def crossover(pop_after_sel):
    pop_nextgen = pop_after_sel
    for i in range(0,len(pop_after_sel),2):
        new_par = []
        child_1 , child_2 = pop_nextgen[i] , pop_nextgen[i+1]
        new_par = np.concatenate((child_1[:len(child_1)//2],child_2[len(child_1)//2:]))
        pop_nextgen.append(new_par)
    return pop_nextgen


def mutation(pop_after_cross,mutation_rate,n_feat):
    mutation_range = int(mutation_rate*n_feat)
    pop_next_gen = []
    for n in range(0,len(pop_after_cross)):
        chromo = pop_after_cross[n]
        rand_posi = []
        for i in range(0,mutation_range):
            pos = randint(0,n_feat-1)
            rand_posi.append(pos)
        for j in rand_posi:
            chromo[j] = not chromo[j]
        pop_next_gen.append(chromo)
    return pop_next_gen


In [None]:
def generations(df,label,size,n_feat,n_parents,mutation_rate,n_gen,X_train,
                                   X_test, Y_train, Y_test):
    best_chromo= []
    best_score= []
    population_nextgen=initilization_of_population(size,n_feat)
    for i in range(n_gen):
        scores, pop_after_fit = fitness_score(population_nextgen)
        print('Best score in generation',i+1,':',scores[:1])  #2
        pop_after_sel = selection(pop_after_fit,n_parents)
        pop_after_cross = crossover(pop_after_sel)
        population_nextgen = mutation(pop_after_cross,mutation_rate,n_feat)
        best_chromo.append(pop_after_fit[0])
        best_score.append(scores[0])
    return best_chromo,best_score

In [None]:
data = pd.read_csv("weatherAUS.csv")
n = 95460
data.drop(data.tail(n).index,inplace = True)
label = data["RainTomorrow"]
label = np.where(label == 'Yes',1,0)
data.drop(["Date","Location"],axis = 1,inplace = True)

print("Rainfall in Australia dataset:\n",data.shape[0],"Records\n",data.shape[1],"Features")

Rainfall in Australia dataset:
 50000 Records
 21 Features


In [None]:
data['RainToday'].replace({'No': 0, 'Yes': 1},inplace = True)
data['RainTomorrow'].replace({'No': 0, 'Yes': 1},inplace = True)


In [None]:
display(data.head())
#print("All the features in this dataset have continuous values")

Unnamed: 0,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindGustDir,WindGustSpeed,WindDir9am,WindDir3pm,WindSpeed9am,...,Humidity9am,Humidity3pm,Pressure9am,Pressure3pm,Cloud9am,Cloud3pm,Temp9am,Temp3pm,RainToday,RainTomorrow
0,13.4,22.9,0.6,,,W,44.0,W,WNW,20.0,...,71.0,22.0,1007.7,1007.1,8.0,,16.9,21.8,0.0,0.0
1,7.4,25.1,0.0,,,WNW,44.0,NNW,WSW,4.0,...,44.0,25.0,1010.6,1007.8,,,17.2,24.3,0.0,0.0
2,12.9,25.7,0.0,,,WSW,46.0,W,WSW,19.0,...,38.0,30.0,1007.6,1008.7,,2.0,21.0,23.2,0.0,0.0
3,9.2,28.0,0.0,,,NE,24.0,SE,E,11.0,...,45.0,16.0,1017.6,1012.8,,,18.1,26.5,0.0,0.0
4,17.5,32.3,1.0,,,W,41.0,ENE,NW,7.0,...,82.0,33.0,1010.8,1006.0,7.0,8.0,17.8,29.7,0.0,0.0


In [None]:
data.isnull().sum()

MinTemp            524
MaxTemp            388
Rainfall          1278
Evaporation      25953
Sunshine         31559
WindGustDir       5508
WindGustSpeed     5501
WindDir9am        5566
WindDir3pm        2429
WindSpeed9am      1128
WindSpeed3pm      1755
Humidity9am        781
Humidity3pm       1394
Pressure9am       6991
Pressure3pm       6945
Cloud9am         20386
Cloud3pm         20824
Temp9am            500
Temp3pm           1155
RainToday         1278
RainTomorrow      1277
dtype: int64

In [None]:
data.dtypes

MinTemp          float64
MaxTemp          float64
Rainfall         float64
Evaporation      float64
Sunshine         float64
WindGustDir       object
WindGustSpeed    float64
WindDir9am        object
WindDir3pm        object
WindSpeed9am     float64
WindSpeed3pm     float64
Humidity9am      float64
Humidity3pm      float64
Pressure9am      float64
Pressure3pm      float64
Cloud9am         float64
Cloud3pm         float64
Temp9am          float64
Temp3pm          float64
RainToday        float64
RainTomorrow     float64
dtype: object

In [None]:
#Setting numerical values with mean and categorical with max count
data = data.apply(lambda x:x.fillna(x.mean())
if x.dtype =='float' else x.fillna(x.value_counts().index[0]))

In [None]:
data.isnull().sum()

MinTemp          0
MaxTemp          0
Rainfall         0
Evaporation      0
Sunshine         0
WindGustDir      0
WindGustSpeed    0
WindDir9am       0
WindDir3pm       0
WindSpeed9am     0
WindSpeed3pm     0
Humidity9am      0
Humidity3pm      0
Pressure9am      0
Pressure3pm      0
Cloud9am         0
Cloud3pm         0
Temp9am          0
Temp3pm          0
RainToday        0
RainTomorrow     0
dtype: int64

In [None]:
data.head()

Unnamed: 0,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindGustDir,WindGustSpeed,WindDir9am,WindDir3pm,WindSpeed9am,...,Humidity9am,Humidity3pm,Pressure9am,Pressure3pm,Cloud9am,Cloud3pm,Temp9am,Temp3pm,RainToday,RainTomorrow
0,13.4,22.9,0.6,5.223381,7.333673,W,44.0,W,WNW,20.0,...,71.0,22.0,1007.7,1007.1,8.0,4.685689,16.9,21.8,0.0,0.0
1,7.4,25.1,0.0,5.223381,7.333673,WNW,44.0,NNW,WSW,4.0,...,44.0,25.0,1010.6,1007.8,4.652601,4.685689,17.2,24.3,0.0,0.0
2,12.9,25.7,0.0,5.223381,7.333673,WSW,46.0,W,WSW,19.0,...,38.0,30.0,1007.6,1008.7,4.652601,2.0,21.0,23.2,0.0,0.0
3,9.2,28.0,0.0,5.223381,7.333673,NE,24.0,SE,E,11.0,...,45.0,16.0,1017.6,1012.8,4.652601,4.685689,18.1,26.5,0.0,0.0
4,17.5,32.3,1.0,5.223381,7.333673,W,41.0,ENE,NW,7.0,...,82.0,33.0,1010.8,1006.0,7.0,8.0,17.8,29.7,0.0,0.0


In [None]:
data['WindGustDir'].unique()

array(['W', 'WNW', 'WSW', 'NE', 'NNW', 'N', 'NNE', 'SW', 'S', 'ENE',
       'SSE', 'NW', 'SE', 'ESE', 'E', 'SSW'], dtype=object)

In [None]:
data['WindGustDir'].nunique()

16

In [None]:
data['WindGustDir'].replace(['W', 'WNW', 'WSW', 'NE', 'NNW', 'N', 'NNE', 'SW', 'ENE', 'SSE','S','NW','SE', 'ESE','E','SSW'],[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15], inplace=True)
data['WindDir9am'].replace(['W', 'WNW', 'WSW', 'NE', 'NNW', 'N', 'NNE', 'SW', 'ENE', 'SSE','S','NW','SE', 'ESE','E','SSW'],[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15], inplace=True)
data['WindDir3pm'].replace(['W', 'WNW', 'WSW', 'NE', 'NNW', 'N', 'NNE', 'SW', 'ENE', 'SSE','S','NW','SE', 'ESE','E','SSW'],[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15], inplace=True)

In [None]:
data.head()

Unnamed: 0,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindGustDir,WindGustSpeed,WindDir9am,WindDir3pm,WindSpeed9am,...,Humidity9am,Humidity3pm,Pressure9am,Pressure3pm,Cloud9am,Cloud3pm,Temp9am,Temp3pm,RainToday,RainTomorrow
0,13.4,22.9,0.6,5.223381,7.333673,0,44.0,0,1,20.0,...,71.0,22.0,1007.7,1007.1,8.0,4.685689,16.9,21.8,0.0,0.0
1,7.4,25.1,0.0,5.223381,7.333673,1,44.0,4,2,4.0,...,44.0,25.0,1010.6,1007.8,4.652601,4.685689,17.2,24.3,0.0,0.0
2,12.9,25.7,0.0,5.223381,7.333673,2,46.0,0,2,19.0,...,38.0,30.0,1007.6,1008.7,4.652601,2.0,21.0,23.2,0.0,0.0
3,9.2,28.0,0.0,5.223381,7.333673,3,24.0,12,14,11.0,...,45.0,16.0,1017.6,1012.8,4.652601,4.685689,18.1,26.5,0.0,0.0
4,17.5,32.3,1.0,5.223381,7.333673,0,41.0,8,11,7.0,...,82.0,33.0,1010.8,1006.0,7.0,8.0,17.8,29.7,0.0,0.0


In [None]:
def fitness_score(population):
    scores = []
    for chromosome in population:
        logmodel.fit(X_train.iloc[:,chromosome],Y_train)
        predictions = logmodel.predict(X_test.iloc[:,chromosome])
        scores.append(accuracy_score(Y_test,predictions))
    scores, population = np.array(scores), np.array(population)
    inds = np.argsort(scores)
    return list(scores[inds][::-1]), list(population[inds,:][::-1])

In [None]:
score1 = acc_score(data,label)
score1

Unnamed: 0,Classifier,Accuracy
0,LinearSVM,1.0
1,Logistic,1.0
2,RandomForest,1.0
3,AdaBoost,1.0
4,DecisionTree,1.0
5,GradientBoosting,1.0
6,RadialSVM,0.835941
7,KNeighbors,0.834704


overfiting

In [None]:
logmodel = RandomForestClassifier(n_estimators=200, random_state=0)
X_train,X_test, Y_train, Y_test = split(data,label)
chromo_df_bc,score_bc=generations(data,label,size=80,n_feat=data.shape[1],n_parents=64,mutation_rate=0.20,n_gen=5,
                         X_train = X_train,X_test = X_test,Y_train = Y_train,Y_test = Y_test)

Best score in generation 1 : [1.0]
Best score in generation 2 : [1.0]
Best score in generation 3 : [1.0]
Best score in generation 4 : [1.0]
Best score in generation 5 : [1.0]


In [None]:
# import KNeighbors ClaSSifier from sklearn
from sklearn.neighbors import KNeighborsClassifier


# instantiate the model
knn = KNeighborsClassifier(n_neighbors=4)


# fit the model to the training set
logmodel= knn.fit(X_train, Y_train)
logmodel

In [None]:
X_train,X_test, Y_train, Y_test = split(data,label)
chromo_df_bc,score_bc=generations(data,label,size=80,n_feat=data.shape[1],n_parents=64,mutation_rate=0.20,n_gen=5,
                         X_train = X_train,X_test = X_test,Y_train = Y_train,Y_test = Y_test)

Best score in generation 1 : [0.8384]
Best score in generation 2 : [0.84208]
Best score in generation 3 : [0.84928]
Best score in generation 4 : [0.842]
Best score in generation 5 : [0.84008]


#***Improvement of 1-2% can be seen as the generation increases.***

In [None]:
plot(score_bc,0.9,1.0,c = "gold")

In [None]:
#def fitting(X, y, C, gamma):
    # Create training and testing samples
from sklearn.svm import SVC # for Support Vector Classification model
X_train, X_test, y_train, y_test = train_test_split(data, label, test_size=0.2, random_state=0)

    # Fit the model
    # Note, available kernels: {‘linear’, ‘poly’, ‘rbf’, ‘sigmoid’, ‘precomputed’}, default=’rbf’
    logmodel = SVC(kernel='rbf', probability=True, 1, gamma=0.1)
    clf = logmodel.fit(X_train, Y_train)