In [36]:
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn.model_selection import train_test_split 
from sklearn.decomposition import PCA
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, f1_score
import matplotlib.pyplot as plt
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import GradientBoostingClassifier

In [2]:
train = pd.read_csv('persons_pics_train.csv')
test = pd.read_csv('persons_pics_reserved.csv')

In [4]:
train.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,2905,2906,2907,2908,2909,2910,2911,2912,2913,label
0,0.379085,0.312418,0.352941,0.445752,0.466667,0.473203,0.484967,0.513725,0.545098,0.571242,...,0.597386,0.620915,0.732026,0.852288,0.921569,0.937255,0.941176,0.950327,0.946405,John Ashcroft
1,0.45098,0.466667,0.512418,0.509804,0.45098,0.458824,0.51634,0.571242,0.607843,0.618301,...,0.430065,0.453595,0.679739,0.93464,0.985621,0.981699,0.963399,0.938562,0.87451,Ariel Sharon
2,0.589542,0.614379,0.688889,0.729412,0.738562,0.775163,0.789543,0.806536,0.820915,0.837908,...,0.733333,0.854902,0.937255,0.624837,0.128105,0.082353,0.10719,0.095425,0.100654,Colin Powell
3,0.39085,0.43268,0.499346,0.562091,0.597386,0.644444,0.670588,0.65098,0.637908,0.639216,...,0.713726,0.898039,0.968627,0.959477,0.83268,0.443137,0.266667,0.303268,0.313725,Jean Chretien
4,0.645752,0.626144,0.605229,0.585621,0.602614,0.664052,0.701961,0.709804,0.733333,0.771242,...,0.115033,0.129412,0.132026,0.129412,0.155556,0.171242,0.16732,0.176471,0.183007,Colin Powell


In [5]:
test.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,2904,2905,2906,2907,2908,2909,2910,2911,2912,2913
0,0.315033,0.393464,0.517647,0.636601,0.652288,0.653595,0.677124,0.72549,0.71634,0.695425,...,0.031373,0.047059,0.047059,0.053595,0.05098,0.036601,0.027451,0.011765,0.007843,0.003922
1,0.518954,0.664052,0.690196,0.627451,0.576471,0.575163,0.605229,0.633987,0.666667,0.684967,...,0.568627,0.583007,0.577778,0.54902,0.479739,0.403922,0.427451,0.569935,0.772549,0.924183
2,0.196078,0.205229,0.219608,0.197386,0.179085,0.223529,0.295425,0.363399,0.411765,0.471895,...,0.471895,0.479739,0.486274,0.511111,0.528105,0.556863,0.586928,0.611765,0.610458,0.627451
3,0.330719,0.376471,0.413072,0.420915,0.430065,0.454902,0.477124,0.486274,0.487582,0.494118,...,0.688889,0.715033,0.724183,0.695425,0.308497,0.126797,0.12549,0.12549,0.111111,0.109804
4,0.405229,0.428758,0.487582,0.56732,0.619608,0.648366,0.665359,0.686275,0.701961,0.717647,...,0.611765,0.630065,0.627451,0.746405,0.700654,0.2,0.099346,0.095425,0.087582,0.08366


In [30]:
pca = PCA(svd_solver='full', n_components=500)
pca.fit(train.drop(['label'], axis = 1))
pca.explained_variance_ratio_.sum()

0.9916301255119919

In [31]:
train_data_transformed = pca.transform(train.drop(['label'], axis=1))
test_data_transformed = pca.transform(test)

In [32]:
tuned_parameters = [{'kernel': ['linear', 'poly', 'rbf', 'sigmoid'], 'gamma': [1e-3, 1e-4],
                     'C': [1, 10, 100, 1000], 'class_weight': [None, 'balanced']}]


cv = GridSearchCV(SVC(), tuned_parameters, refit=True, verbose=3)
cv.fit((train_data_transformed), train['label'])

Fitting 5 folds for each of 64 candidates, totalling 320 fits
[CV 1/5] END C=1, class_weight=None, gamma=0.001, kernel=linear;, score=0.797 total time=   0.2s
[CV 2/5] END C=1, class_weight=None, gamma=0.001, kernel=linear;, score=0.804 total time=   0.2s
[CV 3/5] END C=1, class_weight=None, gamma=0.001, kernel=linear;, score=0.792 total time=   0.2s
[CV 4/5] END C=1, class_weight=None, gamma=0.001, kernel=linear;, score=0.789 total time=   0.3s
[CV 5/5] END C=1, class_weight=None, gamma=0.001, kernel=linear;, score=0.796 total time=   0.3s
[CV 1/5] END C=1, class_weight=None, gamma=0.001, kernel=poly;, score=0.338 total time=   0.4s
[CV 2/5] END C=1, class_weight=None, gamma=0.001, kernel=poly;, score=0.340 total time=   0.4s
[CV 3/5] END C=1, class_weight=None, gamma=0.001, kernel=poly;, score=0.340 total time=   0.3s
[CV 4/5] END C=1, class_weight=None, gamma=0.001, kernel=poly;, score=0.343 total time=   0.2s
[CV 5/5] END C=1, class_weight=None, gamma=0.001, kernel=poly;, score=0.3

[CV 4/5] END C=10, class_weight=None, gamma=0.001, kernel=linear;, score=0.789 total time=   0.2s
[CV 5/5] END C=10, class_weight=None, gamma=0.001, kernel=linear;, score=0.796 total time=   0.3s
[CV 1/5] END C=10, class_weight=None, gamma=0.001, kernel=poly;, score=0.338 total time=   0.5s
[CV 2/5] END C=10, class_weight=None, gamma=0.001, kernel=poly;, score=0.343 total time=   0.5s
[CV 3/5] END C=10, class_weight=None, gamma=0.001, kernel=poly;, score=0.340 total time=   0.4s
[CV 4/5] END C=10, class_weight=None, gamma=0.001, kernel=poly;, score=0.343 total time=   0.3s
[CV 5/5] END C=10, class_weight=None, gamma=0.001, kernel=poly;, score=0.340 total time=   0.3s
[CV 1/5] END C=10, class_weight=None, gamma=0.001, kernel=rbf;, score=0.789 total time=   0.3s
[CV 2/5] END C=10, class_weight=None, gamma=0.001, kernel=rbf;, score=0.747 total time=   0.3s
[CV 3/5] END C=10, class_weight=None, gamma=0.001, kernel=rbf;, score=0.777 total time=   0.4s
[CV 4/5] END C=10, class_weight=None, g

[CV 2/5] END C=100, class_weight=None, gamma=0.001, kernel=poly;, score=0.351 total time=   0.8s
[CV 3/5] END C=100, class_weight=None, gamma=0.001, kernel=poly;, score=0.355 total time=   0.4s
[CV 4/5] END C=100, class_weight=None, gamma=0.001, kernel=poly;, score=0.362 total time=   0.4s
[CV 5/5] END C=100, class_weight=None, gamma=0.001, kernel=poly;, score=0.355 total time=   0.4s
[CV 1/5] END C=100, class_weight=None, gamma=0.001, kernel=rbf;, score=0.842 total time=   0.3s
[CV 2/5] END C=100, class_weight=None, gamma=0.001, kernel=rbf;, score=0.834 total time=   0.4s
[CV 3/5] END C=100, class_weight=None, gamma=0.001, kernel=rbf;, score=0.808 total time=   0.4s
[CV 4/5] END C=100, class_weight=None, gamma=0.001, kernel=rbf;, score=0.811 total time=   0.4s
[CV 5/5] END C=100, class_weight=None, gamma=0.001, kernel=rbf;, score=0.800 total time=   0.4s
[CV 1/5] END C=100, class_weight=None, gamma=0.001, kernel=sigmoid;, score=0.827 total time=   0.3s
[CV 2/5] END C=100, class_weight

[CV 4/5] END C=1000, class_weight=None, gamma=0.001, kernel=poly;, score=0.377 total time=   0.3s
[CV 5/5] END C=1000, class_weight=None, gamma=0.001, kernel=poly;, score=0.392 total time=   0.3s
[CV 1/5] END C=1000, class_weight=None, gamma=0.001, kernel=rbf;, score=0.835 total time=   0.2s
[CV 2/5] END C=1000, class_weight=None, gamma=0.001, kernel=rbf;, score=0.830 total time=   0.3s
[CV 3/5] END C=1000, class_weight=None, gamma=0.001, kernel=rbf;, score=0.811 total time=   0.3s
[CV 4/5] END C=1000, class_weight=None, gamma=0.001, kernel=rbf;, score=0.811 total time=   0.3s
[CV 5/5] END C=1000, class_weight=None, gamma=0.001, kernel=rbf;, score=0.804 total time=   0.2s
[CV 1/5] END C=1000, class_weight=None, gamma=0.001, kernel=sigmoid;, score=0.797 total time=   0.2s
[CV 2/5] END C=1000, class_weight=None, gamma=0.001, kernel=sigmoid;, score=0.808 total time=   0.2s
[CV 3/5] END C=1000, class_weight=None, gamma=0.001, kernel=sigmoid;, score=0.792 total time=   0.2s
[CV 4/5] END C=1

In [33]:
cv.best_params_

{'C': 100, 'class_weight': 'balanced', 'gamma': 0.001, 'kernel': 'rbf'}

In [34]:
pred = cv.predict(test_data_transformed)

In [35]:
pred

array(['George W Bush', 'Ariel Sharon', 'Tony Blair', 'Jacques Chirac',
       'Tony Blair', 'Colin Powell', 'Donald Rumsfeld', 'Colin Powell',
       'Tony Blair', 'Gerhard Schroeder', 'Donald Rumsfeld',
       'Hugo Chavez', 'George W Bush', 'Hugo Chavez', 'Colin Powell',
       'George W Bush', 'Ariel Sharon', 'Colin Powell', 'John Ashcroft',
       'Gerhard Schroeder', 'Ariel Sharon', 'Donald Rumsfeld',
       'Ariel Sharon', 'George W Bush', 'George W Bush',
       'Donald Rumsfeld', 'Donald Rumsfeld', 'Tony Blair',
       'Serena Williams', 'Jean Chretien', 'George W Bush',
       'George W Bush', 'George W Bush', 'George W Bush', 'George W Bush',
       'Colin Powell', 'Donald Rumsfeld', 'Jacques Chirac',
       'George W Bush', 'Gerhard Schroeder', 'Colin Powell',
       'Donald Rumsfeld', 'Gerhard Schroeder', 'George W Bush',
       'Tony Blair', 'George W Bush', 'George W Bush', 'Tony Blair',
       'Colin Powell', 'Tony Blair', 'Serena Williams', 'Jean Chretien',
       'Ton

In [38]:
logreg = GradientBoostingClassifier()
logreg.fit(train_data_transformed, train)

ValueError: y should be a 1d array, got an array of shape (1326, 2915) instead.