# Data Science London + Scikit-learn

Dataset Description
This is a synthetic data set of 40 features, representing objects from two classes (labeled as 0 or 1). The training set has 1000 samples and the testing set has 9000.

Current score - 0.9127 (about 91.27% accuracy)

Rank - 136/190 (Top 72%)

In [83]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.preprocessing import Normalizer
from sklearn.decomposition import PCA

In [84]:
# load data: none of the files has a header row, so columns get integer labels
train, train_label = (
    pd.read_csv(csv_path, header=None)
    for csv_path in ('data/london/train.csv', 'data/london/trainLabels.csv')
)
# the test features are read straight from the zip archive
test = pd.read_csv('data/london/test.zip', compression='zip', header=None)

In [85]:
# Detect outliers with the 1.5 * IQR rule, evaluated column by column
q1, q3 = train.quantile(0.25), train.quantile(0.75)
iqr = q3 - q1

# per-column fences: anything outside [minimum, maximum] counts as an outlier
minimum = q1 - 1.5 * iqr
maximum = q3 + 1.5 * iqr

# keep only the out-of-fence cells; every in-range cell becomes NaN
df = train.where(train.lt(minimum) | train.gt(maximum))

# NOTE: despite its name, `outlier_exist` is True when NO cell survived the
# filter above (i.e. no outlier was found), hence the negation when printing
outlier_exist = np.all(df.isnull())
print('Outliers exists: ', not outlier_exist)

Outliers exists:  True


In [86]:
# Solution 2: Replace Outliers with Mean

def mean_outliers(data: pd.DataFrame) -> pd.DataFrame:
    """Replace IQR outliers in every column with that column's mean.

    A value is treated as an outlier when it lies more than 1.5 * IQR above
    the column's third quartile or below its first quartile. The replacement
    value is the mean of the original column, i.e. it is computed before any
    value is replaced and therefore still includes the outliers themselves.

    Parameters
    ----------
    data : pd.DataFrame
        Numeric frame to clean. It is left untouched.

    Returns
    -------
    pd.DataFrame
        A new frame with the same index and columns as ``data`` in which
        every outlier has been replaced by its column mean.
    """
    q1 = data.quantile(0.25)
    q3 = data.quantile(0.75)
    iqr = q3 - q1
    maximum = q3 + 1.5 * iqr
    minimum = q1 - 1.5 * iqr

    # Work on a copy so the caller's frame is never modified in place.
    cleaned = data.copy()

    for col in data.columns:
        column = data[col]
        is_outlier = (column > maximum[col]) | (column < minimum[col])
        cleaned[col] = np.where(is_outlier, column.mean(), column)

    return cleaned

# clean both feature sets; each frame is processed with its own quartiles and mean
train, test = map(mean_outliers, (train, test))

In [87]:
# Use Normalizer to scale data: every sample (row) is rescaled to unit norm.
# Normalizer learns nothing from the data it is fitted on, so the train and
# test sets are transformed independently of each other.
train = Normalizer().fit_transform(train)

scaler = Normalizer()
test = scaler.fit_transform(test)

In [88]:
# PCA: keep the first 12 principal components.
# The projection is learned from the training set only and then re-used,
# unchanged, for the test set.
# random_state pins the decomposition for the cases where scikit-learn selects
# a randomized SVD solver, so repeated runs yield the same components.
pca = PCA(n_components=12, random_state=42)

train = pca.fit_transform(train)
test = pca.transform(test)

In [89]:
# features: the transformed training matrix; labels: flattened to a 1-D array
X = train
y = np.ravel(train_label)

# hold out 22% of the labelled rows for validation (fixed seed)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.22,
    random_state=42,
)

In [90]:
# Tuned SVC

# With the RBF kernel only C and gamma shape the decision function;
# 'degree' and 'coef0' are accepted but have no effect for this kernel.
best_params = dict(
    C=1,
    coef0=0,
    degree=1,
    gamma=2,
    probability=True,
    kernel='rbf',
    random_state=666,
)

# fit() returns the estimator itself, so construction and training can be chained
tuned_svc = SVC(**best_params).fit(X_train, y_train)

# accuracy on the held-out validation split, expressed as a percentage
score = tuned_svc.score(X_test, y_test)
accuracy = score * 100

# class predictions for the competition test set (used for the submission)
y_pred = tuned_svc.predict(test)

print(f'Accuracy: {accuracy:.2f}%')  # with 12 PCA components: 92.73

Accuracy: 92.73%


In [91]:
# Build the submission table: 1-based row ids next to the predicted labels
submission = pd.DataFrame(
    data={
        'Id': np.arange(y_pred.shape[0]) + 1,
        'Solution': np.round(y_pred).astype(int).ravel(),
    }
)

# save CSV (without the DataFrame index column)
submission.to_csv(path_or_buf='submission.csv', index=False)
print('Submission saved!')

Submission saved!
