# ML on Data Science London

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
import warnings
# Silence every warning for the rest of the session so cell outputs stay
# uncluttered. This also hides deprecation and convergence warnings.
warnings.simplefilter('ignore')

In [None]:
# Load the competition files. They are read with header=None, so pandas
# assigns integer column names that are replaced below.
DATA_DIR = '/kaggle/input/data-science-london-scikit-learn'
train = pd.read_csv(f'{DATA_DIR}/train.csv', header=None)
test = pd.read_csv(f'{DATA_DIR}/test.csv', header=None)
train_labels = pd.read_csv(f'{DATA_DIR}/trainLabels.csv', header=None)

# Name the feature columns Col1..ColN. Each frame is sized from its own
# width (the original sized `test` from `train`), so the naming stays valid
# regardless of when the Label column is appended to `train`.
train.columns = [f'Col{i}' for i in range(1, train.shape[1] + 1)]
test.columns = [f'Col{i}' for i in range(1, test.shape[1] + 1)]

# Attach the target as a Series (the first column of the labels file)
# rather than assigning a whole DataFrame to a single column.
train['Label'] = train_labels.iloc[:, 0]

# Last expression of the cell: the notebook renders the training frame.
train

In [None]:
# Print the column dtypes, non-null counts and memory usage of `train`.
train.info()

In [None]:
# Summary statistics (count, mean, std, quartiles, min/max) per numeric column.
train.describe()

## EDA

In [None]:
# Bar chart of the number of training rows per Label value (class balance).
sns.countplot(data=train,x='Label',palette='viridis')

In [None]:
# Rank the columns by their correlation with Label, strongest positive first.
label_corr = train.corr()['Label']
ranked = label_corr.sort_values(ascending=False)
# Skip the first entry (presumably Label's correlation with itself) and show
# the next five.
ranked.iloc[1:].head()

In [None]:
# Distribution plot of Col15.
# NOTE(review): `distplot` is deprecated in recent seaborn releases; consider
# `histplot(..., kde=True)` once the installed seaborn version is confirmed.
sns.distplot(train['Col15'],color='red')

In [None]:
# Box plot of Col15 split by Label, to compare its spread between the classes.
sns.boxplot(data=train,x='Label',y='Col15',palette='rainbow')

In [None]:
# Scatter of Col13 against Col15. The string arguments are looked up in
# `train` through data=, and the points are coloured by the Label column.
plt.scatter(data=train,x='Col13',y='Col15',c='Label')

In [None]:
# Hexbin joint plot of Col13 vs Col15 with the marginal distributions on the axes.
sns.jointplot(data=train,x='Col13',y='Col15',kind='hex',color='purple')

In [None]:
# Heatmap of the pairwise correlations between all columns of `train`
# (Label included), drawn on a 12x10 inch figure.
corr_matrix = train.corr()
plt.figure(figsize=(12, 10))
sns.heatmap(corr_matrix, cmap='coolwarm')

## Machine Learning

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Separate the target from the features, then hold out 30% of the rows for
# evaluation. The fixed random_state makes the split reproducible.
y = train['Label']
X = train.drop(columns='Label')
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=101,
)

### Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression

In [None]:
# Fit a logistic regression with default settings on the training split and
# predict the held-out split. `fit` returns the estimator, so the calls chain.
log_model = LogisticRegression().fit(X_train, y_train)
pred = log_model.predict(X_test)

In [None]:
from sklearn.metrics import classification_report,confusion_matrix

In [None]:
# Confusion matrix followed by the per-class precision/recall/F1 table for
# whatever `pred` currently holds.
cm = confusion_matrix(y_test, pred)
report = classification_report(y_test, pred)
print(cm, '\n', report)

### K-Nearest Neighbors

In [None]:
from sklearn.neighbors import KNeighborsClassifier

In [None]:
# Misclassification rate on the held-out split for k = 1..39 neighbours.
error = []
for k in range(1, 40):
    knn = KNeighborsClassifier(n_neighbors=k).fit(X_train, y_train)
    pred = knn.predict(X_test)
    misclassified = pred != y_test
    # The mean of the boolean mask is the fraction of wrong predictions.
    error.append(np.mean(misclassified))

In [None]:
# Error rate against the number of neighbours (same 1..39 range as the sweep):
# red dashed line with blue-filled circle markers.
plt.plot(range(1, 40), error, 'r--o', markerfacecolor='blue')
plt.xlabel('No. of neighbors')
plt.ylabel('Error')

In [None]:
# Refit KNN with k=5 on the training split and predict the held-out split.
knn = KNeighborsClassifier(n_neighbors=5).fit(X_train, y_train)
pred = knn.predict(X_test)

In [None]:
# Confusion matrix, then the classification report, for the current `pred`.
cm = confusion_matrix(y_test, pred)
report = classification_report(y_test, pred)
print(cm, '\n', report)

### Decision Tree

In [None]:
from sklearn.tree import DecisionTreeClassifier

In [None]:
# Default decision tree: fit on the training split, predict the held-out split.
dtree = DecisionTreeClassifier().fit(X_train, y_train)
pred = dtree.predict(X_test)

In [None]:
# Confusion matrix, then the classification report, for the current `pred`.
cm = confusion_matrix(y_test, pred)
report = classification_report(y_test, pred)
print(cm, '\n', report)

### Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
# Default random forest: fit on the training split, predict the held-out split.
rfc = RandomForestClassifier().fit(X_train, y_train)
pred = rfc.predict(X_test)

In [None]:
# Confusion matrix, then the classification report, for the current `pred`.
cm = confusion_matrix(y_test, pred)
report = classification_report(y_test, pred)
print(cm, '\n', report)

### Support Vector Machines

In [None]:
from sklearn.svm import SVC

In [None]:
# Support vector classifier with default settings: fit on the training split,
# predict the held-out split.
svc = SVC().fit(X_train, y_train)
pred = svc.predict(X_test)

In [None]:
# Confusion matrix, then the classification report, for the current `pred`.
cm = confusion_matrix(y_test, pred)
report = classification_report(y_test, pred)
print(cm, '\n', report)

### XGBoost

In [None]:
from xgboost import XGBClassifier

In [None]:
# Gradient-boosted trees with default settings: fit on the training split,
# predict the held-out split.
xgb = XGBClassifier().fit(X_train, y_train)
pred = xgb.predict(X_test)

In [None]:
# Confusion matrix, then the classification report, for the current `pred`.
cm = confusion_matrix(y_test, pred)
report = classification_report(y_test, pred)
print(cm, '\n', report)

## Final Submission

In [None]:
# Final model: KNN with k=5 trained on all labelled rows (X, y), then used to
# predict the unlabelled competition test set.
knn = KNeighborsClassifier(n_neighbors=5).fit(X, y)
pred = knn.predict(test)

In [None]:
# Build the submission table: 1-based row ids as the index, predictions in
# the 'Solution' column.
row_ids = np.arange(1, len(pred) + 1)
final = pd.DataFrame({'Id': row_ids, 'Solution': pred})
final = final.set_index('Id')

In [None]:
# Write the submission to final.csv in the working directory. The index is
# written by default, so the Id index becomes the first column.
final.to_csv('final.csv')

## Extra: Principal Component Analysis

In [None]:
from sklearn.decomposition import PCA

In [None]:
# Project the features onto their first two principal components.
pca=PCA(n_components=2)
# Fit the projection on the training split only, then transform that split.
X_train_pca=pca.fit_transform(X_train)
# Apply the already-fitted projection to the held-out split without refitting.
X_test_pca=pca.transform(X_test)

In [None]:
# Plot the training split in the 2-D PCA space, coloured by class label.
first_component = X_train_pca[:, 0]
second_component = X_train_pca[:, 1]
plt.scatter(first_component, second_component, c=y_train)
plt.ylabel('Component - 2')
plt.xlabel('Component - 1')