# Exercises

## Take the work we did in the lessons further:

In [1]:
# Turn off the notebook's periodic autosave (an interval of 0 disables it).
%autosave 0

Autosave disabled


In [2]:
import pandas as pd
import numpy as np

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report as class_rep

from env import get_connection
from adam_prepare import clean, lemmatize

In [3]:
# Connection target for the `spam_db` database, as returned by the project's
# `env.get_connection` helper.
url = get_connection('spam_db')

# Pull every column of the `spam` table.
query = 'SELECT * FROM spam'

# Load the query result into a DataFrame, using the table's `id` column as the index.
df = pd.read_sql(sql=query, con=url, index_col='id')

df.head(n=5)

Unnamed: 0_level_0,label,text
id,Unnamed: 1_level_1,Unnamed: 2_level_1
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [4]:
# Run the project's `clean` helper over each raw message, passing 'us' as its
# second positional argument, and store the result alongside the original text.
df['clean_text'] = df['text'].apply(lambda message: clean(message, 'us'))
df.head(n=5)

Unnamed: 0_level_0,label,text,clean_text
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,ham,"Go until jurong point, crazy.. Available only ...",go jurong point crazy available bugis n great ...
1,ham,Ok lar... Joking wif u oni...,ok lar joking wif oni
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,free entry 2 wkly comp win fa cup final tkts 2...
3,ham,U dun say so early hor... U c already then say...,dun say early hor c already say
4,ham,"Nah I don't think he goes to usf, he lives aro...",nah ' think goes usf lives around though


In [5]:
# Lemmatize each cleaned message with the project's `lemmatize` helper and keep
# the result as its own column.
df['lemmas'] = df['clean_text'].apply(lemmatize)
df.head(n=5)

Unnamed: 0_level_0,label,text,clean_text,lemmas
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,ham,"Go until jurong point, crazy.. Available only ...",go jurong point crazy available bugis n great ...,go jurong point crazy available bugis n great ...
1,ham,Ok lar... Joking wif u oni...,ok lar joking wif oni,ok lar joking wif oni
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,free entry 2 wkly comp win fa cup final tkts 2...,free entry 2 wkly comp win fa cup final tkts 2...
3,ham,U dun say so early hor... U c already then say...,dun say early hor c already say,dun say early hor c already say
4,ham,"Nah I don't think he goes to usf, he lives aro...",nah ' think goes usf lives around though,nah ' think go usf life around though


In [6]:
# Features are the lemmatized messages; the target is the ham/spam label.
X, y = df['lemmas'], df['label']

# Keep 70% of the rows for training and the remainder for testing; the fixed
# seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    train_size=0.7,
    random_state=42,
)

In [7]:
tfidf = TfidfVectorizer()

# Fit the vectorizer on the training messages only, then reuse the fitted
# vectorizer to encode the test messages.
X_train_tfidfs = tfidf.fit_transform(raw_documents=X_train)
X_test_tfidfs = tfidf.transform(raw_documents=X_test)

# Peek at the first ten encoded training rows.
X_train_tfidfs[0:10]

<10x7197 sparse matrix of type '<class 'numpy.float64'>'
	with 63 stored elements in Compressed Sparse Row format>

In [8]:
# Expand the sparse TF-IDF training matrix into a dense DataFrame with one
# column per vocabulary term, for inspection.
# toarray() yields a plain ndarray; todense() would return the legacy
# numpy.matrix type, which NumPy advises against using in new code.
sh_df = pd.DataFrame(X_train_tfidfs.toarray(),
                     columns=tfidf.get_feature_names_out())
sh_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3900 entries, 0 to 3899
Columns: 7197 entries, 008704050406 to zouk
dtypes: float64(7197)
memory usage: 214.1 MB


In [16]:
# Summary statistics for every term column, transposed so each term is a row.
# NOTE(review): the saved output for this cell shows count=1300 per term while
# sh_df.info() reports 3900 rows, and the execution counter jumps from 8 to 16;
# the output looks stale - re-run from a fresh kernel to confirm.
sh_df.describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
008704050406,1300.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.000000
0089my,1300.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.000000
0121,1300.0,0.000276,0.009969,0.0,0.0,0.0,0.0,0.359427
0125698789,1300.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.000000
02,1300.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.000000
...,...,...,...,...,...,...,...,...
zed,1300.0,0.000503,0.010516,0.0,0.0,0.0,0.0,0.234107
zero,1300.0,0.000315,0.011365,0.0,0.0,0.0,0.0,0.409784
zogtorius,1300.0,0.000290,0.010454,0.0,0.0,0.0,0.0,0.376909
zoom,1300.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.000000


In [17]:
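# Disabled export, kept for reference.
# NOTE(review): `sh_df.columns.value_counts()` counts how often each column label
# occurs (once per term), not how often words appear in the messages, and
# `index=False` would drop the terms themselves from the CSV.
# counts = sh_df.columns.value_counts()
# counts.to_csv('spam_ham_word_counts.csv',index=False)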

In [18]:
# Logistic regression with scikit-learn's default settings, trained on the
# sparse TF-IDF training matrix.
lm = LogisticRegression()

lm.fit(X=X_train_tfidfs, y=y_train)

In [19]:
# Pair each training label with the model's prediction for the same row,
# keeping the original `id` index.
y_train_res = y_train.to_frame(name='actual').assign(
    preds=lm.predict(X_train_tfidfs)
)
y_train_res.head(n=5)

Unnamed: 0_level_0,actual,preds
id,Unnamed: 1_level_1,Unnamed: 2_level_1
708,spam,spam
4338,ham,ham
5029,ham,ham
4921,ham,ham
2592,ham,ham


In [20]:
# Per-class precision / recall / F1 on the training split.
print(class_rep(y_true=y_train_res['actual'], y_pred=y_train_res['preds']))

              precision    recall  f1-score   support

         ham       0.96      1.00      0.98      3372
        spam       0.99      0.74      0.85       528

    accuracy                           0.96      3900
   macro avg       0.98      0.87      0.91      3900
weighted avg       0.96      0.96      0.96      3900



In [22]:
# Pair each held-out label with the model's prediction for the same row,
# keeping the original `id` index.
y_test_res = y_test.to_frame(name='actual').assign(
    preds=lm.predict(X_test_tfidfs)
)
y_test_res.head(n=5)

Unnamed: 0_level_0,actual,preds
id,Unnamed: 1_level_1,Unnamed: 2_level_1
3245,ham,ham
944,ham,ham
1044,spam,ham
2484,ham,ham
812,spam,spam


In [23]:
# Per-class precision / recall / F1 on the held-out test split.
print(class_rep(y_true=y_test_res['actual'], y_pred=y_test_res['preds']))

              precision    recall  f1-score   support

         ham       0.95      1.00      0.97      1453
        spam       0.96      0.64      0.77       219

    accuracy                           0.95      1672
   macro avg       0.95      0.82      0.87      1672
weighted avg       0.95      0.95      0.94      1672



-------

### 1. What other types of models (i.e. different classification algorithms) could you use?

### 2. How do the models compare when trained on term frequency data alone, instead of TF-IDF values?