In [14]:
import numpy as np
import pandas as pd
import sklearn
from sklearn import metrics
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_validate
from sklearn.model_selection import RandomizedSearchCV
import warnings
warnings.filterwarnings('ignore')

In [2]:
files=['Drugs\\SMSSpamCollection.txt']
columns=['Is it spam?','Message']
df = pd.DataFrame()
for file in files:
    df = df.append(pd.read_csv(file, sep="\t", header=None))
    
df.columns = columns
df.head()

Unnamed: 0,Is it spam?,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [3]:
#Cleaning the y
df['Is it spam?'] = np.where(df['Is it spam?'] == 'ham',0,1)

In [4]:
df

Unnamed: 0,Is it spam?,Message
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."
5,1,FreeMsg Hey there darling it's been 3 week's n...
6,0,Even my brother is not like to speak with me. ...
7,0,As per your request 'Melle Melle (Oru Minnamin...
8,1,WINNER!! As a valued network customer you have...
9,1,Had your mobile 11 months or more? U R entitle...


In [5]:
y = df['Is it spam?']
x = df['Message']

In [16]:
test_size=0.3
seed = 42
scoring=('neg_mean_squared_error','neg_mean_absolute_error','neg_median_absolute_error','accuracy','f1','r2')
num_folds = 5
param_grid = {'C':[0.001, 0.01, 0.1, 1, 10]}

from sklearn.model_selection import train_test_split
#X_train, X_test, y_train, y_test = train_test_split(x, y, random_state=42, test_size=test_size)
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.feature_extraction.text import CountVectorizer

In [7]:
vect = CountVectorizer(min_df=5, stop_words='english')
X = vect.fit_transform(x)
#X_test_dtm = vect.transform(X_test)
print('Dimensiune vocabular: ', len(vect.get_feature_names()))

Dimensiune vocabular:  1603


In [8]:
grid = GridSearchCV(LogisticRegression(solver='lbfgs', max_iter=1000), param_grid=param_grid, cv=5, n_jobs=4)
grid.fit(X, y)
print('best cross validation score:', grid.best_score_)
print('best params:', grid.best_params_)

best cross validation score: 0.9811557788944724
best params: {'C': 10}


In [19]:
# Vom creea data frame-ul cu valorile rezultate
dataframe = pd.DataFrame(columns=['Model_Name','Search_Type','Mean_absolute_error','Mean_squared_error',
                                 'Median_absolute_error','Accuracy','F1','R2','fit_time'], index=['0','1','2','3','4','5','6','7'])
dataframe

Unnamed: 0,Model_Name,Search_Type,Mean_absolute_error,Mean_squared_error,Median_absolute_error,Accuracy,F1,R2,fit_time
0,,,,,,,,,
1,,,,,,,,,
2,,,,,,,,,
3,,,,,,,,,
4,,,,,,,,,
5,,,,,,,,,
6,,,,,,,,,
7,,,,,,,,,


In [15]:
sklearn.metrics.SCORERS.keys()

dict_keys(['explained_variance', 'r2', 'neg_median_absolute_error', 'neg_mean_absolute_error', 'neg_mean_squared_error', 'neg_mean_squared_log_error', 'accuracy', 'roc_auc', 'balanced_accuracy', 'average_precision', 'neg_log_loss', 'brier_score_loss', 'adjusted_rand_score', 'homogeneity_score', 'completeness_score', 'v_measure_score', 'mutual_info_score', 'adjusted_mutual_info_score', 'normalized_mutual_info_score', 'fowlkes_mallows_score', 'precision', 'precision_macro', 'precision_micro', 'precision_samples', 'precision_weighted', 'recall', 'recall_macro', 'recall_micro', 'recall_samples', 'recall_weighted', 'f1', 'f1_macro', 'f1_micro', 'f1_samples', 'f1_weighted'])

In [17]:
# Ne generam algoritmul
from sklearn.neural_network import MLPClassifier
# Oscileaza intre 3 si 7 vecini pentru a stabili ce numar de vecini are cel mai bun randament
parameters = {
    'hidden_layer_sizes': [
     (1,),(2,),(3,),(4,),(5,),(6,),(7,),(8,),(9,),(10,)]
            }
mod = cross_validate(GridSearchCV(MLPClassifier(learning_rate='adaptive'), parameters, cv=3, scoring='neg_mean_squared_error'), X, y, cv=5, scoring=scoring)
print(mod)

{'fit_time': array([50.04635024, 50.1433847 , 53.29129982, 54.16164994, 49.47272134]), 'score_time': array([0.00299191, 0.00398922, 0.00496292, 0.00598264, 0.00500727]), 'test_neg_mean_squared_error': array([-0.01165919, -0.01883408, -0.01885099, -0.01705566, -0.01795332]), 'train_neg_mean_squared_error': array([-0.0013462 , -0.0013462 , -0.00157021, -0.00089726, -0.00112158]), 'test_neg_mean_absolute_error': array([-0.01165919, -0.01883408, -0.01885099, -0.01705566, -0.01795332]), 'train_neg_mean_absolute_error': array([-0.0013462 , -0.0013462 , -0.00157021, -0.00089726, -0.00112158]), 'test_neg_median_absolute_error': array([-0., -0., -0., -0., -0.]), 'train_neg_median_absolute_error': array([-0., -0., -0., -0., -0.]), 'test_accuracy': array([0.98834081, 0.98116592, 0.98114901, 0.98294434, 0.98204668]), 'train_accuracy': array([0.9986538 , 0.9986538 , 0.99842979, 0.99910274, 0.99887842]), 'test_f1': array([0.95532646, 0.92682927, 0.92473118, 0.93189964, 0.93103448]), 'train_f1': arra

In [20]:
# Datele pentru KNN cu Grid Search CV
dataframe['Model_Name']['0'] = 'MLPC'
dataframe['Search_Type']['0'] = 'Grid Search CV'
dataframe['fit_time']['0'] = mod['fit_time'].mean()
dataframe['Median_absolute_error']['0'] = mod['test_neg_median_absolute_error'].mean()
dataframe['Mean_absolute_error']['0'] = mod['test_neg_mean_absolute_error'].mean()
dataframe['Mean_squared_error']['0'] = mod['test_neg_mean_squared_error'].mean()
dataframe['F1']['0']=mod['test_f1'].mean()
dataframe['R2']['0']=mod['test_r2'].mean()
dataframe['Accuracy']['0']=mod['test_accuracy'].mean()

In [21]:
dataframe

Unnamed: 0,Model_Name,Search_Type,Mean_absolute_error,Mean_squared_error,Median_absolute_error,Accuracy,F1,R2,fit_time
0,MLPC,Grid Search CV,-0.0168706,-0.0168706,0.0,0.983129,0.933964,0.854648,51.4231
1,,,,,,,,,
2,,,,,,,,,
3,,,,,,,,,
4,,,,,,,,,
5,,,,,,,,,
6,,,,,,,,,
7,,,,,,,,,


In [22]:
mod = cross_validate(RandomizedSearchCV(MLPClassifier(learning_rate='adaptive'), parameters, cv=3, scoring='neg_mean_squared_error'), X, y, cv=5, scoring=scoring)
print(mod)

{'fit_time': array([58.45459342, 61.15344286, 58.0395937 , 60.72323918, 57.85889316]), 'score_time': array([0.0029912 , 0.00299239, 0.00398922, 0.00449872, 0.00299168]), 'test_neg_mean_squared_error': array([-0.01345291, -0.01793722, -0.01795332, -0.01974865, -0.01795332]), 'train_neg_mean_squared_error': array([-0.00112183, -0.00157056, -0.00067295, -0.00044863, -0.00157021]), 'test_neg_mean_absolute_error': array([-0.01345291, -0.01793722, -0.01795332, -0.01974865, -0.01795332]), 'train_neg_mean_absolute_error': array([-0.00112183, -0.00157056, -0.00067295, -0.00044863, -0.00157021]), 'test_neg_median_absolute_error': array([-0., -0., -0., -0., -0.]), 'train_neg_median_absolute_error': array([-0., -0., -0., -0., -0.]), 'test_accuracy': array([0.98654709, 0.98206278, 0.98204668, 0.98025135, 0.98204668]), 'train_accuracy': array([0.99887817, 0.99842944, 0.99932705, 0.99955137, 0.99842979]), 'test_f1': array([0.94809689, 0.93055556, 0.92857143, 0.92028986, 0.93103448]), 'train_f1': arra

In [44]:
# Datele pentru KNN cu Grid Search CV
dataframe['Model_Name']['1'] = 'MLPC'
dataframe['Search_Type']['1'] = 'Random Search CV'
dataframe['fit_time']['1'] = mod['fit_time'].mean()
dataframe['Median_absolute_error']['1'] = mod['test_neg_median_absolute_error'].mean()
dataframe['Mean_absolute_error']['1'] = mod['test_neg_mean_absolute_error'].mean()
dataframe['Mean_squared_error']['1'] = mod['test_neg_mean_squared_error'].mean()
dataframe['F1']['1']=mod['test_f1'].mean()
dataframe['R2']['1']=mod['test_r2'].mean()
dataframe['Accuracy']['1']=mod['test_accuracy'].mean()

In [32]:
dataframe

Unnamed: 0,Model_Name,Search_Type,Mean_absolute_error,Mean_squared_error,Median_absolute_error,Accuracy,F1,R2,fit_time
0,MLPC,Grid Search CV,-0.0168706,-0.0168706,0.0,0.983129,0.933964,0.854648,51.4231
1,MLPC,Random Search CV,0.0174091,0.0174091,-0.0,0.982591,0.93171,0.850008,59.246
2,,,,,,,,,
3,,,,,,,,,
4,,,,,,,,,
5,,,,,,,,,
6,,,,,,,,,
7,,,,,,,,,


In [33]:
from sklearn.svm import SVC

parameters = {
    'gamma':['auto','scale'],
            }
svc = SVC()
model = RandomizedSearchCV(svc, parameters, cv=3, n_iter=2)
mod = cross_validate(model, X, y, cv=5, return_train_score=True, scoring=scoring)

In [42]:
# Datele pentru KNN cu Grid Search CV
dataframe['Model_Name']['3'] = 'SVC'
dataframe['Search_Type']['3'] = 'Random Search CV'
dataframe['fit_time']['3'] = mod['fit_time'].mean()
dataframe['Median_absolute_error']['3'] = mod['test_neg_median_absolute_error'].mean()
dataframe['Mean_absolute_error']['3'] = mod['test_neg_mean_absolute_error'].mean()
dataframe['Mean_squared_error']['3'] = mod['test_neg_mean_squared_error'].mean()
dataframe['F1']['3']=mod['test_f1'].mean()
dataframe['R2']['3']=mod['test_r2'].mean()
dataframe['Accuracy']['3']=mod['test_accuracy'].mean()

In [43]:
dataframe

Unnamed: 0,Model_Name,Search_Type,Mean_absolute_error,Mean_squared_error,Median_absolute_error,Accuracy,F1,R2,fit_time
0,MLPC,Grid Search CV,-0.0168706,-0.0168706,0.0,0.983129,0.933964,0.854648,51.4231
1,MLPC,Random Search CV,0.0174091,0.0174091,-0.0,0.982591,0.93171,0.850008,59.246
2,SVC,Grid Search CV,-0.0218963,-0.0218963,0.0,0.978104,0.911083,0.811332,3.82604
3,SVC,Random Search CV,-0.0218963,-0.0218963,0.0,0.978104,0.911083,0.811332,3.82604
4,,,,,,,,,
5,,,,,,,,,
6,,,,,,,,,
7,,,,,,,,,


In [37]:
parameters = {
    'gamma':['auto','scale'],
            }
svc = SVC()
model = GridSearchCV(svc, parameters, cv=3)
mod = cross_validate(model, X, y, cv=5, return_train_score=True, scoring=scoring)

In [40]:
# Datele pentru KNN cu Grid Search CV
dataframe['Model_Name']['2'] = 'SVC'
dataframe['Search_Type']['2'] = 'Grid Search CV'
dataframe['fit_time']['2'] = mod['fit_time'].mean()
dataframe['Median_absolute_error']['2'] = mod['test_neg_median_absolute_error'].mean()
dataframe['Mean_absolute_error']['2'] = mod['test_neg_mean_absolute_error'].mean()
dataframe['Mean_squared_error']['2'] = mod['test_neg_mean_squared_error'].mean()
dataframe['F1']['2']=mod['test_f1'].mean()
dataframe['R2']['2']=mod['test_r2'].mean()
dataframe['Accuracy']['2']=mod['test_accuracy'].mean()

In [41]:
dataframe

Unnamed: 0,Model_Name,Search_Type,Mean_absolute_error,Mean_squared_error,Median_absolute_error,Accuracy,F1,R2,fit_time
0,MLPC,Grid Search CV,-0.0168706,-0.0168706,0.0,0.983129,0.933964,0.854648,51.4231
1,MLPC,Random Search CV,0.0174091,0.0174091,-0.0,0.982591,0.93171,0.850008,59.246
2,SVC,Grid Search CV,-0.0218963,-0.0218963,0.0,0.978104,0.911083,0.811332,3.82604
3,SVC,Random Search CV,0.0218963,0.0218963,-0.0,0.978104,0.911083,0.811332,3.94347
4,,,,,,,,,
5,,,,,,,,,
6,,,,,,,,,
7,,,,,,,,,


In [45]:
from sklearn.tree import DecisionTreeClassifier

parameters = {
    'max_depth':list(range(1,10))
            }
dtc = DecisionTreeClassifier()
model = GridSearchCV(dtc, parameters, cv=3, iid=False)
mod = cross_validate(model, X, y, cv=5, return_train_score=True, scoring=scoring)

In [46]:
# Datele pentru KNN cu Grid Search CV
dataframe['Model_Name']['4'] = 'Desicion Tree Classifier'
dataframe['Search_Type']['4'] = 'Grid Search CV'
dataframe['fit_time']['4'] = mod['fit_time'].mean()
dataframe['Median_absolute_error']['4'] = mod['test_neg_median_absolute_error'].mean()
dataframe['Mean_absolute_error']['4'] = mod['test_neg_mean_absolute_error'].mean()
dataframe['Mean_squared_error']['4'] = mod['test_neg_mean_squared_error'].mean()
dataframe['F1']['4']=mod['test_f1'].mean()
dataframe['R2']['4']=mod['test_r2'].mean()
dataframe['Accuracy']['4']=mod['test_accuracy'].mean()

In [47]:
dataframe

Unnamed: 0,Model_Name,Search_Type,Mean_absolute_error,Mean_squared_error,Median_absolute_error,Accuracy,F1,R2,fit_time
0,MLPC,Grid Search CV,-0.0168706,-0.0168706,0.0,0.983129,0.933964,0.854648,51.4231
1,MLPC,Random Search CV,-0.0218963,-0.0218963,0.0,0.978104,0.911083,0.811332,3.82604
2,SVC,Grid Search CV,-0.0218963,-0.0218963,0.0,0.978104,0.911083,0.811332,3.82604
3,SVC,Random Search CV,-0.0218963,-0.0218963,0.0,0.978104,0.911083,0.811332,3.82604
4,Desicion Tree Classifier,Grid Search CV,-0.0525854,-0.0525854,0.0,0.947415,0.769813,0.546979,0.189611
5,,,,,,,,,
6,,,,,,,,,
7,,,,,,,,,


In [48]:
from sklearn.tree import DecisionTreeClassifier

parameters = {
    'max_depth':list(range(1,10))
            }
dtc = DecisionTreeClassifier()
model = RandomizedSearchCV(dtc, parameters, cv=3, iid=False)
mod = cross_validate(model, X, y, cv=5, return_train_score=True, scoring=scoring)

In [51]:
# Datele pentru KNN cu Grid Search CV
dataframe['Model_Name']['5'] = 'Desicion Tree Classifier'
dataframe['Search_Type']['5'] = 'Random Search CV'
dataframe['fit_time']['5'] = mod['fit_time'].mean()
dataframe['Median_absolute_error']['5'] = mod['test_neg_median_absolute_error'].mean()
dataframe['Mean_absolute_error']['5'] = mod['test_neg_mean_absolute_error'].mean()
dataframe['Mean_squared_error']['5'] = mod['test_neg_mean_squared_error'].mean()
dataframe['F1']['5']=mod['test_f1'].mean()
dataframe['R2']['5']=mod['test_r2'].mean()
dataframe['Accuracy']['5']=mod['test_accuracy'].mean()

In [52]:
dataframe

Unnamed: 0,Model_Name,Search_Type,Mean_absolute_error,Mean_squared_error,Median_absolute_error,Accuracy,F1,R2,fit_time
0,MLPC,Grid Search CV,-0.0168706,-0.0168706,0.0,0.983129,0.933964,0.854648,51.4231
1,MLPC,Random Search CV,-0.0218963,-0.0218963,0.0,0.978104,0.911083,0.811332,3.82604
2,SVC,Grid Search CV,-0.0218963,-0.0218963,0.0,0.978104,0.911083,0.811332,3.82604
3,SVC,Random Search CV,-0.0218963,-0.0218963,0.0,0.978104,0.911083,0.811332,3.82604
4,Desicion Tree Classifier,Grid Search CV,-0.0525854,-0.0525854,0.0,0.947415,0.769813,0.546979,0.189611
5,Desicion Tree Classifier,Random Search CV,-0.0524061,-0.0524061,0.0,0.947594,0.769489,0.548511,0.189084
6,,,,,,,,,
7,,,,,,,,,


In [58]:
from sklearn.ensemble import RandomForestClassifier
parameters = {
    'n_estimators':list(range(1,30))
            }
rfc = RandomForestClassifier()
model = GridSearchCV(rfc, parameters, cv=3, iid=False)
mod = cross_validate(model, X, y, cv=5, return_train_score=True, scoring=scoring)

In [59]:
# Datele pentru KNN cu Grid Search CV
dataframe['Model_Name']['6'] = 'Random Forest Classifier'
dataframe['Search_Type']['6'] = 'Grid Search CV'
dataframe['fit_time']['6'] = mod['fit_time'].mean()
dataframe['Median_absolute_error']['6'] = mod['test_neg_median_absolute_error'].mean()
dataframe['Mean_absolute_error']['6'] = mod['test_neg_mean_absolute_error'].mean()
dataframe['Mean_squared_error']['6'] = mod['test_neg_mean_squared_error'].mean()
dataframe['F1']['6']=mod['test_f1'].mean()
dataframe['R2']['6']=mod['test_r2'].mean()
dataframe['Accuracy']['6']=mod['test_accuracy'].mean()

In [60]:
dataframe

Unnamed: 0,Model_Name,Search_Type,Mean_absolute_error,Mean_squared_error,Median_absolute_error,Accuracy,F1,R2,fit_time
0,MLPC,Grid Search CV,-0.0168706,-0.0168706,0.0,0.983129,0.933964,0.854648,51.4231
1,MLPC,Random Search CV,-0.0218963,-0.0218963,0.0,0.978104,0.911083,0.811332,3.82604
2,SVC,Grid Search CV,-0.0218963,-0.0218963,0.0,0.978104,0.911083,0.811332,3.82604
3,SVC,Random Search CV,-0.0218963,-0.0218963,0.0,0.978104,0.911083,0.811332,3.82604
4,Desicion Tree Classifier,Grid Search CV,-0.0525854,-0.0525854,0.0,0.947415,0.769813,0.546979,0.189611
5,Desicion Tree Classifier,Random Search CV,-0.0524061,-0.0524061,0.0,0.947594,0.769489,0.548511,0.189084
6,Random Forest Classifier,Grid Search CV,-0.022076,-0.022076,0.0,0.977924,0.912524,0.809774,7.48
7,,,,,,,,,


In [61]:
from sklearn.ensemble import RandomForestClassifier
parameters = {
    'n_estimators':list(range(1,30))
            }
rfc = RandomForestClassifier()
model = RandomizedSearchCV(rfc, parameters, cv=3, iid=False)
mod = cross_validate(model, X, y, cv=5, return_train_score=True, scoring=scoring)

In [62]:
# Datele pentru KNN cu Grid Search CV
dataframe['Model_Name']['7'] = 'Random Forest Classifier'
dataframe['Search_Type']['7'] = 'Random Search CV'
dataframe['fit_time']['7'] = mod['fit_time'].mean()
dataframe['Median_absolute_error']['7'] = mod['test_neg_median_absolute_error'].mean()
dataframe['Mean_absolute_error']['7'] = mod['test_neg_mean_absolute_error'].mean()
dataframe['Mean_squared_error']['7'] = mod['test_neg_mean_squared_error'].mean()
dataframe['F1']['7']=mod['test_f1'].mean()
dataframe['R2']['7']=mod['test_r2'].mean()
dataframe['Accuracy']['7']=mod['test_accuracy'].mean()

In [63]:
dataframe

Unnamed: 0,Model_Name,Search_Type,Mean_absolute_error,Mean_squared_error,Median_absolute_error,Accuracy,F1,R2,fit_time
0,MLPC,Grid Search CV,-0.0168706,-0.0168706,0,0.983129,0.933964,0.854648,51.4231
1,MLPC,Random Search CV,-0.0218963,-0.0218963,0,0.978104,0.911083,0.811332,3.82604
2,SVC,Grid Search CV,-0.0218963,-0.0218963,0,0.978104,0.911083,0.811332,3.82604
3,SVC,Random Search CV,-0.0218963,-0.0218963,0,0.978104,0.911083,0.811332,3.82604
4,Desicion Tree Classifier,Grid Search CV,-0.0525854,-0.0525854,0,0.947415,0.769813,0.546979,0.189611
5,Desicion Tree Classifier,Random Search CV,-0.0524061,-0.0524061,0,0.947594,0.769489,0.548511,0.189084
6,Random Forest Classifier,Grid Search CV,-0.022076,-0.022076,0,0.977924,0.912524,0.809774,7.48
7,Random Forest Classifier,Random Search CV,-0.0227938,-0.0227938,0,0.977206,0.910775,0.803594,2.47219
