In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier

  from numpy.core.umath_tests import inner1d


In [53]:
elcerrito = pd.read_csv('./dataset/elcerrito150x100.csv')
elsobrante = pd.read_csv('./dataset/elsobrante150x100.csv')
pinole = pd.read_csv('./dataset/pinole150x100.csv')
hercules = pd.read_csv('./dataset/hercules150x100.csv')
sanpablo = pd.read_csv('./dataset/sanpablo150x100.csv')

In [61]:
def classification(city):
    
    X = city.iloc[:,14:45014]
    y = city.iloc[:,45014]

    X = X.astype('float32') / 255.
    
    X_train, X_test, y_train, y_test = train_test_split(X, 
                                                    y, 
                                                    test_size=0.33, 
                                                    random_state=42,
                                                    stratify = y)
    
    print('X_train:', X_train.shape, 
          'X_test:', X_test.shape, 
          'y_train:', y_train.shape, 
          'y_test:', y_test.shape)
    
    baseline = [round(max(y_train.value_counts(normalize=True)),3), round(max(y_test.value_counts(normalize=True)),3)]
    
    model_name = ['Logistic Regression', 'Decision tree', 'random Forest', 'KNN', 'SVM', 'Gradient Boosting', 'Baseline']
    
    model = [
        LogisticRegression(random_state=42),
        DecisionTreeClassifier(random_state=42),
        RandomForestClassifier(random_state=42),
        KNeighborsClassifier(),
        svm.SVC(random_state=42),
        GradientBoostingClassifier(random_state=42)        
    ]
    
    params = [
        {'penalty': ['l1', 'l2'], 'C': [.1, .5, 1, 5, 10]},
        {},
        {'n_estimators': [5, 10, 15]},
        {'n_neighbors' : [3, 5, 7]},
        {'C': [1, 5, 10], 'gamma': [0.001, 0.01]},
        {'learning_rate': [.05, .1]}             
    ]
    
    accuracy = []
    
    for _ in range(6):
        print(model_name[_])
        gs = GridSearchCV(model[_], params[_], cv = None)
        gs.fit(X_train, y_train)
        accuracy.append([round(gs.score(X_train, y_train),3), round(gs.score(X_test, y_test),3)])
        
    accuracy.append(baseline)
    
    df = pd.DataFrame(data = accuracy, index = model_name, columns = ['train', 'test'])
    
    return df
    

In [69]:
elcerrito_results = classification(elcerrito)
elcerrito_results

X_train: (150, 45000) X_test: (75, 45000) y_train: (150,) y_test: (75,)
Logistic Regression
Decision tree
random Forest
KNN
SVM
Gradient Boosting


Unnamed: 0,train,test
Logistic Regression,0.66,0.667
Decision tree,1.0,0.547
random Forest,0.993,0.547
KNN,0.667,0.64
SVM,1.0,0.667
Gradient Boosting,1.0,0.56
Baseline,0.66,0.667


In [62]:
pinole_result = classification(pinole)
pinole_result

X_train: (179, 45000) X_test: (89, 45000) y_train: (179,) y_test: (89,)
Logistic Regression
Decision tree
random Forest
KNN
SVM
Gradient Boosting


Unnamed: 0,train,test
Logistic Regression,0.749,0.742
Decision tree,1.0,0.64
random Forest,0.989,0.708
KNN,0.827,0.809
SVM,1.0,0.742
Gradient Boosting,1.0,0.753
Baseline,0.749,0.742


In [57]:
elsobrante_results = classification(elsobrante)
elsobrante_results

X_train: (115, 45000) X_test: (58, 45000) y_train: (115,) y_test: (58,)
Logistic Regression
Decision Tree
Random Forest
KNN
SVM
Gradient Boosting


Unnamed: 0,train,test
Logistic Regression,0.956522,0.62069
Decision tree,1.0,0.534483
random Forest,0.991304,0.568966
KNN,0.756522,0.603448
SVM,1.0,0.586207
Gradient Boosting,1.0,0.62069
Baseline,0.583,0.586


In [58]:
hercules_results = classification(hercules)
hercules_results

X_train: (153, 45000) X_test: (76, 45000) y_train: (153,) y_test: (76,)
Logistic Regression
Decision Tree
Random Forest
KNN
SVM
Gradient Boosting


Unnamed: 0,train,test
Logistic Regression,0.679739,0.684211
Decision tree,1.0,0.539474
random Forest,0.993464,0.657895
KNN,0.771242,0.513158
SVM,1.0,0.684211
Gradient Boosting,1.0,0.671053
Baseline,0.68,0.684


In [60]:
sanpablo_results = classification(sanpablo)
sanpablo_results

X_train: (148, 45000) X_test: (74, 45000) y_train: (148,) y_test: (74,)
Logistic Regression
Decision Tree
Random Forest
KNN
SVM
Gradient Boosting


Unnamed: 0,train,test
Logistic Regression,1.0,0.783784
Decision tree,1.0,0.662162
random Forest,0.966216,0.743243
KNN,0.777027,0.756757
SVM,1.0,0.756757
Gradient Boosting,1.0,0.743243
Baseline,0.757,0.757


## Oversampling on train dataset

In [96]:
def classification_over(city):
    
    X = city.iloc[:,14:]
    y = city.iloc[:,45014]
    
    X_train, X_test, y_train, y_test = train_test_split(X, 
                                                    y, 
                                                    test_size=0.33, 
                                                    random_state=42,
                                                    stratify = y)
    
    print('before oversampling: ', 
          y_train.value_counts().index[0], 
          ':', 
          y_train.value_counts().index[1], 
          '=',
          y_train.value_counts().values[0], 
          ':', 
          y_train.value_counts().values[1])
    
    # find all data of minority class
    zero = X_train[X_train['label'] == 0].shape[0]
    one = X_train[X_train['label'] == 1].shape[0]
    zero, one
    if zero < one:
        minority = X_train[X_train['label'] == 0]
        num = one - zero
        duplicate = minority.sample(n=num, replace=True, random_state=42)
    elif zero > one: 
        minority = X_train[X_train['label'] == 1]
        num = zero - one
        duplicate = minority.sample(n=num, replace=True, random_state=42)
        
    # create a oversamling X_train
    X_train_over = pd.concat([X_train, duplicate])
    
    # X_train1, X_test1, y_train1, y_test1 for model
    X_train1 = X_train_over.iloc[:,:45000]
    y_train1 = X_train_over.iloc[:,45000]
    X_test1 = X_test.iloc[:,:45000]
    y_test1 = X_test.iloc[:,45000]
    
    print('after oversampling: ', 
          y_train1.value_counts().index[0], 
          ':', 
          y_train1.value_counts().index[1], 
          '=',
          y_train1.value_counts().values[0], 
          ':', 
          y_train1.value_counts().values[1])
    
    
    print('X_train:', X_train1.shape, 
          'X_test:', X_test1.shape, 
          'y_train:', y_train1.shape, 
          'y_test:', y_test1.shape)
    
    baseline = [round(max(y_train1.value_counts(normalize=True)),3), round(max(y_test1.value_counts(normalize=True)),3)]
    
    model_name = ['Logistic Regression', 'Decision tree', 'random Forest', 'KNN', 'SVM', 'Gradient Boosting', 'Baseline']
    
    model = [
        LogisticRegression(random_state=42),
        DecisionTreeClassifier(random_state=42),
        RandomForestClassifier(random_state=42),
        KNeighborsClassifier(),
        svm.SVC(random_state=42),
        GradientBoostingClassifier(random_state=42)        
    ]
    
    params = [
        {'penalty': ['l1', 'l2'], 'C': [.1, .5, 1, 5, 10]},
        {},
        {'n_estimators': [5, 10, 15]},
        {'n_neighbors' : [3, 5, 7]},
        {'C': [1, 5, 10], 'gamma': [0.001, 0.01]},
        {'learning_rate': [.05, .1]}             
    ]
    
    accuracy = []
    
    for _ in range(6):
        print(model_name[_])
        gs = GridSearchCV(model[_], params[_], cv = None)
        gs.fit(X_train1, y_train1)
        accuracy.append([round(gs.score(X_train1, y_train1),3), round(gs.score(X_test1, y_test1),3)])
        
    accuracy.append(baseline)
    
    df = pd.DataFrame(data = accuracy, index = model_name, columns = ['train', 'test'])
    
    return df
    

In [97]:
classification_over(elcerrito)

before oversampling:  1 : 0 = 99 : 51
after oversampling:  1 : 0 = 99 : 99
X_train: (198, 45000) X_test: (75, 45000) y_train: (198,) y_test: (75,)
Logistic Regression
Decision tree
random Forest
KNN
SVM
Gradient Boosting


Unnamed: 0,train,test
Logistic Regression,1.0,0.507
Decision tree,1.0,0.507
random Forest,1.0,0.587
KNN,0.813,0.507
SVM,1.0,0.667
Gradient Boosting,1.0,0.627
Baseline,0.5,0.667


In [98]:
classification_over(elsobrante)

before oversampling:  1 : 0 = 67 : 48
after oversampling:  1 : 0 = 67 : 67
X_train: (134, 45000) X_test: (58, 45000) y_train: (134,) y_test: (58,)
Logistic Regression
Decision tree
random Forest
KNN
SVM
Gradient Boosting


Unnamed: 0,train,test
Logistic Regression,1.0,0.603
Decision tree,1.0,0.517
random Forest,0.993,0.552
KNN,0.776,0.552
SVM,1.0,0.586
Gradient Boosting,1.0,0.586
Baseline,0.5,0.586


In [94]:
classification_over(pinole)

before oversampling:  1 : 0 = 134 : 45
after oversampling:  1 : 0 = 134 : 134
X_train: (268, 45000) X_test: (89, 45000) y_train: (268,) y_test: (89,)
Logistic Regression
Decision tree
random Forest
KNN
SVM
Gradient Boosting


Unnamed: 0,train,test
Logistic Regression,1.0,0.73
Decision tree,1.0,0.719
random Forest,1.0,0.73
KNN,0.888,0.596
SVM,1.0,0.742
Gradient Boosting,1.0,0.708
Baseline,0.5,0.742


In [99]:
classification_over(hercules)

before oversampling:  0 : 1 = 104 : 49
after oversampling:  1 : 0 = 104 : 104
X_train: (208, 45000) X_test: (76, 45000) y_train: (208,) y_test: (76,)
Logistic Regression
Decision tree
random Forest
KNN
SVM
Gradient Boosting


Unnamed: 0,train,test
Logistic Regression,1.0,0.684
Decision tree,1.0,0.566
random Forest,1.0,0.645
KNN,0.812,0.434
SVM,1.0,0.684
Gradient Boosting,1.0,0.645
Baseline,0.5,0.684


In [100]:
classification_over(sanpablo)

before oversampling:  1 : 0 = 112 : 36
after oversampling:  1 : 0 = 112 : 112
X_train: (224, 45000) X_test: (74, 45000) y_train: (224,) y_test: (74,)
Logistic Regression
Decision tree
random Forest
KNN
SVM
Gradient Boosting


Unnamed: 0,train,test
Logistic Regression,1.0,0.784
Decision tree,1.0,0.622
random Forest,1.0,0.757
KNN,0.915,0.568
SVM,1.0,0.757
Gradient Boosting,1.0,0.703
Baseline,0.5,0.757
