<h3>Загружаем данные

In [71]:
import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score

In [72]:
data = pd.read_csv("train.csv")
data_test=pd.read_csv("test.csv")

In [73]:
data.head()

Unnamed: 0,Text,Label
0,immer min maa mit em vatter vergliche de,ZH
1,das mues ich säge aber mir hend,LU
2,därartigi het natürlig ineren organisazion gän...,BS
3,schtei und bei gfroore e moondschiinnacht,BE
4,deet si jä,BS


<h3>Tokenizing text with scikit-learn

In [74]:
from sklearn.feature_extraction.text import CountVectorizer

In [75]:
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(data["Text"])
X_train_counts.shape

(15616, 16292)

<h3>Downscaling

In [76]:
from sklearn.feature_extraction.text import TfidfTransformer

In [77]:
#tf_transformer = TfidfTransformer(use_idf=False).fit(X_train_counts)
#X_train_tf = tf_transformer.transform(X_train_counts)
#X_train_tf.shape

tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)
X_train_tfidf.shape

(15616, 16292)

<h3>Training a classifier

In [78]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import BernoulliNB

In [79]:
data["Label"].head()

0    ZH
1    LU
2    BS
3    BE
4    BS
Name: Label, dtype: object

In [80]:
Y_train=np.copy(data["Label"])
Y_train[Y_train=='ZH']=1
Y_train[Y_train=='LU']=2
Y_train[Y_train=='BS']=3
Y_train[Y_train=='BE']=4
Y_train

array([1, 2, 3, ..., 3, 2, 4], dtype=object)

In [81]:
naive_bayes = MultinomialNB().fit(X_train_tfidf, data["Label"])

In [82]:
X_test_counts = count_vect.transform(data_test[" Text"])
X_test_tfidf = tfidf_transformer.transform(X_test_counts)

Y_test = naive_bayes.predict(X_test_tfidf)

In [83]:
Y_test

array(['ZH', 'ZH', 'LU', ..., 'LU', 'BE', 'BE'], 
      dtype='<U2')

<h3>Building a pipeline

In [84]:
from sklearn.pipeline import Pipeline

In [85]:
naive_bayes = Pipeline([('vect', CountVectorizer()),
                    ('tfidf', TfidfTransformer()),
                    ('clf', MultinomialNB()),
])

In [86]:
naive_bayes.fit(data["Text"], data["Label"])

Pipeline(steps=[('vect', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip...inear_tf=False, use_idf=True)), ('clf', MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))])

In [87]:
Y_test=naive_bayes.predict(data_test[" Text"])

<h3>Evaluation of the performance on the test set

In [88]:
predicted = naive_bayes.predict(data["Text"])
accuracy_score(data["Label"],predicted)

0.93205686475409832

In [89]:
pd.DataFrame({'Id':range(1,data_test.shape[0]+1),'Prediction':Y_test}).to_csv('Prediction1.csv',index=False)

<h3>Parameter tuning using grid search

In [90]:
from sklearn.model_selection import GridSearchCV

In [91]:
parameters = {'vect__ngram_range': [(1,8),],
              'vect__analyzer' :('char_wb',),
              'vect__binary' :(False,),
               'tfidf__norm': ('l2',),
               'tfidf__use_idf': (True,),
               'tfidf__smooth_idf': (False,),
               'tfidf__sublinear_tf': (True,),
               'clf__alpha': (0.0002,),
}
gs_naive_bayes = GridSearchCV (naive_bayes,  parameters ,  n_jobs = - 1 )
gs_naive_bayes = gs_naive_bayes.fit(data["Text"], data["Label"])

In [92]:
predicted = gs_naive_bayes.predict(data["Text"])
accuracy_score(data["Label"],predicted)

0.95113985655737709

In [93]:
Y_test = gs_naive_bayes.predict(data_test[" Text"])

In [94]:
pd.DataFrame({'Id':range(1,data_test.shape[0]+1),'Prediction':Y_test}).to_csv('Prediction2.csv',index=False)

In [95]:
gs_naive_bayes.best_params_

{'clf__alpha': 0.0002,
 'tfidf__norm': 'l2',
 'tfidf__smooth_idf': False,
 'tfidf__sublinear_tf': True,
 'tfidf__use_idf': True,
 'vect__analyzer': 'char_wb',
 'vect__binary': False,
 'vect__ngram_range': (1, 8)}

In [96]:
cross_val_score(gs_naive_bayes,data["Text"], data["Label"],scoring='accuracy')

array([ 0.77222969,  0.77045716,  0.78704594])

<h3>SGDClassifier

In [97]:
from sklearn.linear_model import SGDClassifier

In [98]:
SGD = Pipeline([('vect', CountVectorizer()),
                ('tfidf', TfidfTransformer()),
                ('clf', SGDClassifier()),
])

In [99]:
SGD.fit(data["Text"], data["Label"])

Pipeline(steps=[('vect', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip...   penalty='l2', power_t=0.5, random_state=None, shuffle=True,
       verbose=0, warm_start=False))])

In [100]:
predicted = SGD.predict(data["Text"])
accuracy_score(data["Label"],predicted)

0.93961321721311475

In [101]:
Y_test = SGD.predict(data_test[" Text"])
pd.DataFrame({'Id':range(1,data_test.shape[0]+1),'Prediction':Y_test}).to_csv('Prediction3.csv',index=False)

In [102]:
SGDClassifier().get_params().keys()

dict_keys(['alpha', 'average', 'class_weight', 'epsilon', 'eta0', 'fit_intercept', 'l1_ratio', 'learning_rate', 'loss', 'n_iter', 'n_jobs', 'penalty', 'power_t', 'random_state', 'shuffle', 'verbose', 'warm_start'])

In [103]:
parameters = {'vect__ngram_range': [(1,5),],
              'vect__analyzer' :('char_wb',),
              'vect__binary' :(False,),
               'tfidf__norm': ('l2',),
               'tfidf__use_idf': (True,),
               'tfidf__smooth_idf': (False,),
               'tfidf__sublinear_tf': (True,),
               'clf__alpha': (0.0001035,),
               'clf__loss': ('modified_huber',),
               'clf__l1_ratio': (0.165,),
               'clf__penalty': ('l2',),
               'clf__power_t': (0.5,),
               'clf__verbose': (0,),
               'clf__warm_start': (True,),
               'clf__fit_intercept': (True,),
               'clf__average': (False,),
               'clf__n_iter': (700,),

}
gs_SGD = GridSearchCV (SGD,  parameters ,  n_jobs = - 1 )
gs_SGD = gs_SGD.fit(data["Text"], data["Label"])

In [104]:
predicted = gs_SGD.predict(data["Text"])
accuracy_score(data["Label"],predicted)

0.96695696721311475

In [105]:
Y_test = gs_SGD.predict(data_test[" Text"])
pd.DataFrame({'Id':range(1,data_test.shape[0]+1),'Prediction':Y_test}).to_csv('Prediction4.csv',index=False)

In [106]:
gs_SGD.best_params_

{'clf__alpha': 0.0001035,
 'clf__average': False,
 'clf__fit_intercept': True,
 'clf__l1_ratio': 0.165,
 'clf__loss': 'modified_huber',
 'clf__n_iter': 700,
 'clf__penalty': 'l2',
 'clf__power_t': 0.5,
 'clf__verbose': 0,
 'clf__warm_start': True,
 'tfidf__norm': 'l2',
 'tfidf__smooth_idf': False,
 'tfidf__sublinear_tf': True,
 'tfidf__use_idf': True,
 'vect__analyzer': 'char_wb',
 'vect__binary': False,
 'vect__ngram_range': (1, 5)}

In [107]:
cross_val_score(gs_SGD,data["Text"], data["Label"],scoring='accuracy')

array([ 0.84059919,  0.83787937,  0.8493177 ])

<h1>RidgeClassifier

In [108]:
from sklearn.linear_model import RidgeClassifier

In [109]:
RC = Pipeline([('vect', CountVectorizer()),
                ('tfidf', TfidfTransformer()),
                ('clf', RidgeClassifier()),
])

In [110]:
RC.fit(data["Text"], data["Label"])

Pipeline(steps=[('vect', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip...True,
        max_iter=None, normalize=False, random_state=None, solver='auto',
        tol=0.001))])

In [111]:
predicted = RC.predict(data["Text"])
accuracy_score(data["Label"],predicted)

0.96657274590163933

In [112]:
Y_test = RC.predict(data_test[" Text"])
pd.DataFrame({'Id':range(1,data_test.shape[0]+1),'Prediction':Y_test}).to_csv('Prediction5.csv',index=False)

In [113]:
parameters = {'vect__ngram_range': [(1,5),],
              'vect__analyzer' :('char_wb',),
              'vect__binary' :(False,),
               'tfidf__norm': ('l2',),
               'tfidf__use_idf': (True,),
               'tfidf__smooth_idf': (False,),
               'tfidf__sublinear_tf': (True,),
               'clf__alpha': (1e-2,),
}
gs_RC = GridSearchCV (RC,  parameters ,  n_jobs = - 1 )
gs_RC = gs_RC.fit(data["Text"], data["Label"])

In [114]:
predicted = gs_RC.predict(data["Text"])
accuracy_score(data["Label"],predicted)

0.97982838114754101

In [115]:
Y_test = gs_RC.predict(data_test[" Text"])
pd.DataFrame({'Id':range(1,data_test.shape[0]+1),'Prediction':Y_test}).to_csv('Prediction5.csv',index=False)

In [116]:
gs_RC.best_params_

{'clf__alpha': 0.01,
 'tfidf__norm': 'l2',
 'tfidf__smooth_idf': False,
 'tfidf__sublinear_tf': True,
 'tfidf__use_idf': True,
 'vect__analyzer': 'char_wb',
 'vect__binary': False,
 'vect__ngram_range': (1, 5)}

In [117]:
cross_val_score(gs_RC,data["Text"], data["Label"],scoring='accuracy')

array([ 0.73170732,  0.73876297,  0.73976552])

<h1>PassiveAggressiveClassifier

In [118]:
from sklearn.linear_model import PassiveAggressiveClassifier

In [119]:
PAC = Pipeline([('vect', CountVectorizer()),
                ('tfidf', TfidfTransformer()),
                ('clf', PassiveAggressiveClassifier()),
])

In [120]:
PAC.fit(data["Text"], data["Label"])

Pipeline(steps=[('vect', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip...', n_iter=5, n_jobs=1, random_state=None,
              shuffle=True, verbose=0, warm_start=False))])

In [121]:
predicted =PAC.predict(data["Text"])
accuracy_score(data["Label"],predicted)

0.9801485655737705

In [122]:
Y_test = PAC.predict(data_test[" Text"])
pd.DataFrame({'Id':range(1,data_test.shape[0]+1),'Prediction':Y_test}).to_csv('Prediction6.csv',index=False)

In [123]:
parameters = {'vect__ngram_range': [(1,5),],
              'vect__analyzer' :('char_wb',),
              'vect__binary' :(False,),
               'tfidf__norm': ('l2',),
               'tfidf__use_idf': (True,),
               'tfidf__smooth_idf': (False,),
               'tfidf__sublinear_tf': (True,),
               'clf__loss': ('modified_huber',),
               'clf__C': (1.1,),
}
gs_RC = GridSearchCV (PAC,  parameters ,  n_jobs = - 1 )
gs_RC = gs_RC.fit(data["Text"], data["Label"])

In [124]:
predicted = gs_RC.predict(data["Text"])
accuracy_score(data["Label"],predicted)

0.97669057377049184

In [125]:
Y_test = gs_RC.predict(data_test[" Text"])
pd.DataFrame({'Id':range(1,data_test.shape[0]+1),'Prediction':Y_test}).to_csv('Prediction6.csv',index=False)

In [126]:
gs_RC.best_params_

{'clf__C': 1.1,
 'clf__loss': 'modified_huber',
 'tfidf__norm': 'l2',
 'tfidf__smooth_idf': False,
 'tfidf__sublinear_tf': True,
 'tfidf__use_idf': True,
 'vect__analyzer': 'char_wb',
 'vect__binary': False,
 'vect__ngram_range': (1, 5)}

In [127]:
cross_val_score(PAC,data["Text"], data["Label"],scoring='accuracy')

array([ 0.7925869 ,  0.78390319,  0.80376706])

<h1>GradientBoostingClassifier

In [128]:
from sklearn.ensemble import GradientBoostingClassifier

In [129]:
GBC = GradientBoostingClassifier(n_estimators=1000)

In [130]:
count_vect = CountVectorizer()
count_vect.fit(data["Text"])
X_train_counts = count_vect.transform(data["Text"])
X_train_counts.shape

(15616, 16292)

In [131]:
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)
X_train_tfidf.shape

(15616, 16292)

In [132]:
GBC.fit(X_train_tfidf, data["Label"])

GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.1, loss='deviance', max_depth=3,
              max_features=None, max_leaf_nodes=None,
              min_impurity_split=1e-07, min_samples_leaf=1,
              min_samples_split=2, min_weight_fraction_leaf=0.0,
              n_estimators=1000, presort='auto', random_state=None,
              subsample=1.0, verbose=0, warm_start=False)

In [133]:
predicted =GBC.predict(tfidf_transformer.fit_transform(count_vect.transform(data["Text"])).toarray())
accuracy_score(data["Label"],predicted)

0.92021004098360659

In [134]:
Y_test =GBC.predict(tfidf_transformer.fit_transform(count_vect.transform(data_test[" Text"])).toarray())
pd.DataFrame({'Id':range(1,data_test.shape[0]+1),'Prediction':Y_test}).to_csv('Prediction7.csv',index=False)

<h1>RandomForestClassifier

In [135]:
from sklearn.ensemble import RandomForestClassifier

In [136]:
RFC = RandomForestClassifier(n_estimators=1000,max_depth=5)

In [137]:
count_vect = CountVectorizer()
count_vect.fit(data["Text"])
X_train_counts = count_vect.transform(data["Text"])
X_train_counts.shape

(15616, 16292)

In [138]:
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)
X_train_tfidf.shape

(15616, 16292)

In [139]:
RFC.fit(X_train_tfidf, data["Label"])

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=5, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=1000, n_jobs=1, oob_score=False,
            random_state=None, verbose=0, warm_start=False)

In [140]:
predicted =RFC.predict(tfidf_transformer.fit_transform(count_vect.transform(data["Text"])))
accuracy_score(data["Label"],predicted)

0.58324795081967218