# Modeling

In [1]:
from pprint import pprint
import re

import pandas as pd
import numpy as np

%matplotlib inline
import matplotlib.pyplot as plt

import acquire #basic_clean, lemmatize
import prepare


<Response [200]>
<Response [200]>
<Response [200]>
<Response [200]>


**Get and prep data**

In [2]:
df = acquire.get_news_articles()
df.head()

<Response [200]>
<Response [200]>
<Response [200]>
<Response [200]>


Unnamed: 0,title,content,category
0,Antfin transfers 10.3% stake to Paytm chief Vi...,Antfin (Netherlands) Holding BV has transferre...,business
1,"Nepal asks India for rice, sugar to avert poss...",Nepal government has requested India to facili...,business
2,GQG Partners buys 8.1% stake in Adani Power fo...,Investment firm GQG Partners bought an 8.1% st...,business
3,"USDA cuts rice trade forecast for 2023, 2024 p...",US Department of Agriculture (USDA) lowered th...,business
4,Hyundai to buy General Motors' Talegaon plant ...,Hyundai Motor India signed an asset purchase a...,business


In [3]:
document = " ".join(df.content)
# clean up the text
document = re.sub(r"[^a-z0-9'\s]", "", document)
# transform into a series
words = pd.Series(document.split())

In [4]:
words

0             ntfin
1        etherlands
2            olding
3               has
4       transferred
           ...     
2303          three
2304         months
2305             of
2306            the
2307           year
Length: 2308, dtype: object

## Feature Extraction: TF-IDF

- TF: Term Frequency; how often a word appears in a document.
- IDF: Inverse Documnet Frequency; a measure based on in how many documents will a word appear.
- TF-IDF: A combination of the two measures above.

## Term Frequency (TF)

Term frequency can be calculated in a number of ways, all of which reflect how frequently a word appears in a document.

- Raw Count: This is simply the count of the number of occurances of each word.
- Frequency: The number of times each word appears divided by the total number of words.
- Augmented Frequency: The frequency of each word divided by the maximum frequency. This can help prevent bias towards larger documents.

In [5]:
# the count of the number of occurances of each word.
word_df = pd.DataFrame(words.value_counts().index, columns=["word"]).assign(raw_count= words.value_counts().values)
# The number of times each word appears divided by the total number of words.
frequency = word_df.raw_count / len(words)
# The frequency of each word divided by the maximum frequency. 
augmented_frequency = frequency / frequency.max()
# add to the dataframe
word_df["frequency"] = frequency
word_df["augmented_frequency"] = augmented_frequency
word_df.head()

Unnamed: 0,word,raw_count,frequency,augmented_frequency
0,to,80,0.034662,1.0
1,of,64,0.02773,0.8
2,the,60,0.025997,0.75
3,in,56,0.024263,0.7
4,and,52,0.02253,0.65


## Inverse Document Frequency (IDF) (must have multiple ducuments)

- A higher IDF means that a word provides more information. That is, it is more relevant within a single document.

Inverse Document Frequency tells us how much information a word provides. It is based on how commonly a word appears across multiple documents. The metric is divised such that the more frequently a word appears, the lower the IDF for that word will be.

In [37]:
#  Create my own documents from the article dataframe
document1 = df.content[:len(df.content)//3]
document2 = df.content[len(document1):len(document1)*2]
document3 = df.content[len(document1)*2:]
document1.shape,document2.shape,document3.shape

((13,), (13,), (14,))

In [7]:
document1

0     Antfin (Netherlands) Holding BV has transferre...
1     Nepal government has requested India to facili...
2     Investment firm GQG Partners bought an 8.1% st...
3     US Department of Agriculture (USDA) lowered th...
4     Hyundai Motor India signed an asset purchase a...
5     Combined remuneration for the heads of the Nif...
6     H&M is investigating 20 alleged instances of l...
7     A glitch in Bank of Ireland's app allowed cust...
8     Union Cabinet approved a ₹32,500-crore budget ...
9     The Dutch economy has entered a recession as i...
10    Antfin (Netherlands) Holding BV has transferre...
11    Nepal government has requested India to facili...
12    Investment firm GQG Partners bought an 8.1% st...
Name: content, dtype: object

In [32]:
# our 3 example documents
documents = {
    'document1': " ".join(document1),
    'document2': " ".join(document2),
    'document3': " ".join(document3)
}

for doc, topic in documents.items():
    # clean and lemmatize the data and join them back by space
    documents[doc] = " ".join(prepare.prep_data(topic))
    print('Cleaning and lemmatizing...\n')

Cleaning and lemmatizing...

Cleaning and lemmatizing...

Cleaning and lemmatizing...



In [36]:
# A simple way to calculate idf for demonstration. Note that this
# function relies on the globally defined documents variable.
def idf(word):            
    n_occurences = sum([1 for doc in documents.values() if word in doc])
    return len(documents) / n_occurences + 1

# Get a list of the unique words
unique_words = pd.Series(' '.join(documents.values()).split()).unique()

# put the unique words into a data frame
(pd.DataFrame(dict(word=unique_words))
 # calculate the idf for each word
 .assign(idf=lambda df: df.word.apply(idf))
 # sort the data for presentation purposes
 .set_index('word')
 .sort_values(by='idf', ascending=False)
 .head(5))


Unnamed: 0_level_0,idf
word,Unnamed: 1_level_1
antfin,2.0
february,2.0
case,2.0
156,2.0
tracked,2.0


## TF-IDF

In [38]:
tfs = []

# We'll caclulate the tf-idf value for every word across every document

# Start by iterating over all the documents
for doc, text in documents.items():
    # We'll make a data frame that contains the tf for every word in every document
    df = (pd.Series(text.split())
          .value_counts()
          .reset_index()
          .set_axis(['word', 'raw_count'], axis=1)
          .assign(tf=lambda df: df.raw_count / df.shape[0])
          .drop(columns='raw_count')
          .assign(doc=doc))
    # Then add that data frame to our list
    tfs.append(df)

# We'll then concatenate all the tf values together.
(pd.concat(tfs)
 # calculate the idf value for each word
 .assign(idf=lambda df: df.word.apply(idf))
 # then use the if and idf values to calculate tf-idf 
 .assign(tf_idf=lambda df: df.idf * df.tf)
 .drop(columns=['tf', 'idf'])
 .sort_values(by='tf_idf', ascending=False))


Unnamed: 0,word,doc,tf_idf
0,india,document2,0.079734
1,tonne,document2,0.059801
1,tonne,document1,0.059801
0,india,document1,0.059801
2,crore,document2,0.053156
...,...,...,...
191,mitigate,document2,0.006645
192,possible,document2,0.006645
193,food,document2,0.006645
194,scarcity,document2,0.006645


In [39]:
# We'll then concatenate all the tf values together.
(pd.concat(tfs)
 # calculate the idf value for each word
 .assign(idf=lambda df: df.word.apply(idf))
 # then use the if and idf values to calculate tf-idf 
 .assign(tf_idf=lambda df: df.idf * df.tf)
 .drop(columns=['tf', 'idf'])
 .sort_values(by='tf_idf', ascending=False)
 .pipe(lambda df: pd.crosstab(df.doc, df.word, values=df.tf_idf, aggfunc=lambda x: x))
 .fillna(0))


word,03,04,10,1000,100000,103,1066,11,1349,14,...,wednesday,west,white,withdrawn,work,worker,worldwide,would,year,zone
doc,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
document1,0.006645,0.006645,0.006645,0.006645,0.013289,0.013289,0.006645,0.013289,0.013289,0.013289,...,0.019934,0.006645,0.006645,0.013289,0.006645,0.006645,0.013289,0.006645,0.013289,0.006645
document2,0.006645,0.006645,0.013289,0.006645,0.006645,0.006645,0.013289,0.006645,0.006645,0.006645,...,0.013289,0.006645,0.013289,0.013289,0.006645,0.006645,0.006645,0.006645,0.019934,0.006645
document3,0.013289,0.013289,0.006645,0.013289,0.006645,0.006645,0.006645,0.006645,0.006645,0.006645,...,0.019934,0.013289,0.006645,0.026578,0.013289,0.013289,0.006645,0.013289,0.019934,0.013289


# Modeling Wiht scikit-learn

In [49]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score
import env

In [51]:
# tfidf = TfidfVectorizer()
# tfidfs = tfidf.fit_transform(documents.values())
# tfidfs


In [52]:
# pd.DataFrame(tfidfs.todense(), columns=tfidf.get_feature_names_out())


In [50]:
url = env.get_db_access("spam_db")
sql = "SELECT * FROM spam"

df = pd.read_sql(sql, url, index_col="id")
df.head()

Unnamed: 0_level_0,label,text
id,Unnamed: 1_level_1,Unnamed: 2_level_1
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [54]:

X = df.text
y = df.label

X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=.2)
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((4457,), (1115,), (4457,), (1115,))

In [55]:
tfidf = TfidfVectorizer()
X_train = tfidf.fit_transform(X_train)
X_test = tfidf.transform(X_test)

train = pd.DataFrame(dict(actual=y_train))
test = pd.DataFrame(dict(actual=y_test))

lm = LogisticRegression().fit(X_train, y_train)

train['predicted'] = lm.predict(X_train)
test['predicted'] = lm.predict(X_test)


In [56]:
print('Accuracy: {:.2%}'.format(accuracy_score(train.actual, train.predicted)))
print('---')
print('Confusion Matrix')
print(pd.crosstab(train.predicted, train.actual))
print('---')
print(classification_report(train.actual, train.predicted))


Accuracy: 97.51%
---
Confusion Matrix
actual      ham  spam
predicted            
ham        3857   109
spam          2   489
---
              precision    recall  f1-score   support

         ham       0.97      1.00      0.99      3859
        spam       1.00      0.82      0.90       598

    accuracy                           0.98      4457
   macro avg       0.98      0.91      0.94      4457
weighted avg       0.98      0.98      0.97      4457



In [57]:
print('Accuracy: {:.2%}'.format(accuracy_score(test.actual, test.predicted)))
print('---')
print('Confusion Matrix')
print(pd.crosstab(test.predicted, test.actual))
print('---')
print(classification_report(test.actual, test.predicted))


Accuracy: 97.04%
---
Confusion Matrix
actual     ham  spam
predicted           
ham        963    30
spam         3   119
---
              precision    recall  f1-score   support

         ham       0.97      1.00      0.98       966
        spam       0.98      0.80      0.88       149

    accuracy                           0.97      1115
   macro avg       0.97      0.90      0.93      1115
weighted avg       0.97      0.97      0.97      1115

