# Preparation

In [1]:
# Import primary libraries
import sklearn
import xgboost
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Import specific estimators, metrics, and helpers
from xgboost import XGBClassifier
from sklearn.ensemble import GradientBoostingClassifier, AdaBoostClassifier, RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix, precision_score, recall_score, f1_score
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB, BernoulliNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import normalize, scale
from sklearn.preprocessing import MinMaxScaler
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

# Notebook-wide settings: silence all warnings and show up to 100 DataFrame columns
import warnings
warnings.simplefilter("ignore")
pd.options.display.max_columns = 100

# Define classification function

In [2]:
# Classification
def classification(X, y, test_size=.2, random_state=101):
    """Benchmark a fixed set of classifiers on a single hold-out split.

    Every estimator is created with its default settings, fitted on the
    training part of the split and scored on the held-out part.

    Parameters
    ----------
    X : array-like of shape (n_samples, n_features)
        Feature matrix.
    y : array-like of shape (n_samples,) or (n_samples, 1)
        Target labels. A single-column 2-D target (for example a one-column
        DataFrame) is flattened to 1-D before use. The precision, recall and
        F1 calls use scikit-learn's default arguments, which target binary
        labels.
    test_size : float, default=.2
        Forwarded to ``train_test_split``.
    random_state : int, default=101
        Seed forwarded to ``train_test_split``. The estimators themselves are
        not seeded, so scores of the randomised models can vary between runs.

    Returns
    -------
    pandas.DataFrame
        One row per classifier (indexed by its display name) with the columns
        ``AccuracyScore``, ``PrecisionScore``, ``RecallScore`` and ``F1Score``,
        sorted by ``AccuracyScore`` in descending order.
    """
    # One mapping keeps each display name next to its estimator, so the labels
    # in the result can never get out of step with the models.
    models = {
        'XGBoost': XGBClassifier(),
        'Gaussian': GaussianNB(),
        'Bernoulli': BernoulliNB(),
        'KNeighbors': KNeighborsClassifier(),
        'SVC': SVC(),
        'DecisionTree': DecisionTreeClassifier(),
        'Logistic': LogisticRegression(),
        'GradientBoosting': GradientBoostingClassifier(),
        'RandomForest': RandomForestClassifier(),
        'AdaBoost': AdaBoostClassifier(),
    }

    # Estimators expect a 1-D target; a column vector only works through an
    # implicit conversion that scikit-learn warns about.
    y = np.asarray(y)
    if y.ndim == 2 and y.shape[1] == 1:
        y = y.ravel()

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )

    records = []
    for model in models.values():
        predict = model.fit(X_train, y_train).predict(X_test)
        records.append({
            'AccuracyScore': accuracy_score(y_test, predict),
            'PrecisionScore': precision_score(y_test, predict),
            'RecallScore': recall_score(y_test, predict),
            'F1Score': f1_score(y_test, predict),
        })

    result = pd.DataFrame(
        records,
        index=list(models),
        columns=['AccuracyScore', 'PrecisionScore', 'RecallScore', 'F1Score'],
    )

    return result.sort_values('AccuracyScore', ascending=False)

# Load the data

In [3]:
# Read the three competition CSVs; none of them has a header row
df_train, labels, df_test = (
    pd.read_csv(csv_path, header=None)
    for csv_path in (
        '/kaggle/input/data-science-london-scikit-learn/train.csv',
        '/kaggle/input/data-science-london-scikit-learn/trainLabels.csv',
        '/kaggle/input/data-science-london-scikit-learn/test.csv',
    )
)

In [5]:
# Shapes of the training features, training labels and test features
tuple(frame.shape for frame in (df_train, labels, df_test))

((1000, 40), (1000, 1), (9000, 40))

In [6]:
# Learn the min-max scaling from the training features only,
# then apply that same scaling to both feature sets.
scaler = MinMaxScaler()
scaler.fit(df_train)

train = scaler.transform(df_train)
test = scaler.transform(df_test)

In [7]:
# Compare all candidate classifiers on the scaled training data
classification(X=train, y=labels)

Unnamed: 0,AccuracyScore,PrecisionScore,RecallScore,F1Score
RandomForest,0.905,0.939394,0.877358,0.907317
GradientBoosting,0.89,0.911765,0.877358,0.894231
SVC,0.885,0.895238,0.886792,0.890995
XGBoost,0.88,0.886792,0.886792,0.886792
Gaussian,0.84,0.842593,0.858491,0.850467
KNeighbors,0.84,0.830357,0.877358,0.853211
Logistic,0.83,0.833333,0.849057,0.841121
AdaBoost,0.815,0.828571,0.820755,0.824645
DecisionTree,0.77,0.783019,0.783019,0.783019
Bernoulli,0.53,0.53,1.0,0.69281


# Perform prediction

In [8]:
# Fit XGBoost on the full scaled training set, then label the test set
xgbc = XGBClassifier()
xgbc.fit(train, labels)

predictions = xgbc.predict(test)

In [9]:
# Display the predicted labels for the test set
predictions

array([1, 0, 0, ..., 1, 0, 1])

# Generate submission

In [10]:
# Build the submission table: 1-based Ids, one row per prediction.
# The Id range is derived from the number of predictions rather than
# hard-coded, so it stays valid if the test set size changes.
submission = pd.DataFrame({
    'Id': np.arange(1, len(predictions) + 1),
    'Solution': predictions,
})
submission.to_csv('submission.csv', index=False)

In [11]:
# Preview the submission table (columns: Id, Solution)
submission

Unnamed: 0,Id,Solution
0,1,1
1,2,0
2,3,0
3,4,0
4,5,0
...,...,...
8995,8996,1
8996,8997,1
8997,8998,1
8998,8999,0


In [12]:
# Confirmation message; printed unconditionally (this cell does not itself check the file)
print("Successfully saved as CSV file")

Successfully saved as CSV file
