# Text Classification

## Feature Extraction

In [1]:
# Build a word -> integer-id vocabulary from the first document.
# `vocab` and `i` are kept as notebook globals so a later cell can keep
# extending the same mapping with consecutive ids.
# NOTE(review): both paths are absolute and machine-specific.
vocab = {}
i = 1  # next unused id; ids start at 1
one = "/Users/test/Documents/Software-projects/Python Projects/Deep-Learning-Projects/Deep-Learning-Overfitting-Cook-Book/data/1.txt"
two = "/Users/test/Documents/Software-projects/Python Projects/Deep-Learning-Projects/Deep-Learning-Overfitting-Cook-Book/data/2.txt"
with open(one) as f:
  # Lower-case the text and split it on whitespace.
  x = f.read().lower().split()

for word in x:
  # Only the first occurrence of a word receives an id.
  if word not in vocab:
    vocab[word] = i
    i += 1
print(vocab)

{'this': 1, 'is': 2, 'a': 3, 'story': 4, 'about': 5, 'cats': 6, 'our': 7, 'feline': 8, 'pets': 9, 'are': 10, 'furry': 11, 'animals': 12}


In [2]:
# Extend the existing vocabulary with the words of the second document.
# Relies on `two`, `vocab` and `i` from an earlier cell; new ids continue from `i`.
with open(two) as f:
  # Lower-case the text and split it on whitespace.
  x = f.read().lower().split()

for word in x:
  # Words already seen keep their original id.
  if word not in vocab:
    vocab[word] = i
    i += 1
print(vocab)

{'this': 1, 'is': 2, 'a': 3, 'story': 4, 'about': 5, 'cats': 6, 'our': 7, 'feline': 8, 'pets': 9, 'are': 10, 'furry': 11, 'animals': 12, 'surfing': 13, 'catching': 14, 'waves': 15, 'fun': 16, 'popular': 17, 'water': 18, 'sport': 19}


## Bag of Words

In [3]:
import pandas as pd
import numpy as np

In [6]:
# Location of the tab-separated SMS spam collection file.
# NOTE(review): absolute, machine-specific path — the notebook only runs where
# this exact directory exists; consider a path relative to a configurable data dir.
SPAM_PATH = '/Users/test/Documents/Software-projects/Python Projects/Deep-Learning-Projects/Deep-Learning-Overfitting-Cook-Book/data/smsspamcollection.tsv'

In [7]:
# Load the spam dataset; the file is tab-delimited, hence sep='\t'.
df = pd.read_csv(SPAM_PATH, sep='\t')

In [8]:
# Preview the first rows to sanity-check the parsed columns.
df.head()

Unnamed: 0,label,message,length,punct
0,ham,"Go until jurong point, crazy.. Available only ...",111,9
1,ham,Ok lar... Joking wif u oni...,29,6
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,155,6
3,ham,U dun say so early hor... U c already then say...,49,6
4,ham,"Nah I don't think he goes to usf, he lives aro...",61,2


In [9]:
# (rows, columns) of the loaded spam frame.
df.shape

(5572, 4)

In [10]:
# Number of messages per class (ham vs. spam).
df['label'].value_counts()

label
ham     4825
spam     747
Name: count, dtype: int64

## Check class imbalance

In [15]:
# `collections.Counter` is the runtime multiset; `typing.Counter` is only a
# deprecated annotation alias and should not be instantiated.
from collections import Counter

# Report how many samples each class has and its share of the dataset.
# Select the label column by name rather than by position so the cell keeps
# working if columns are added or reordered.
target = df["label"].to_numpy()
counter = Counter(target)
for k, v in counter.items():
  per = v / len(target) * 100  # share of this class, in percent
  print("Class=%s, Count=%d, Percentage=%.3f%%" % (k, v, per))

Class=ham, Count=4825, Percentage=86.594%
Class=spam, Count=747, Percentage=13.406%


In [16]:
from sklearn.model_selection import train_test_split

In [17]:
# Features are the raw message texts; targets are the ham/spam labels.
X = df["message"]
y = df["label"]

In [30]:
# Hold out 33% of the messages for evaluation; the fixed seed makes the split repeatable.
# NOTE(review): the classes are imbalanced (see the class counts above);
# consider stratify=y so both splits keep the same ham/spam ratio.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=42,
)

In [31]:
from sklearn.feature_extraction.text import CountVectorizer

In [21]:
# Bag-of-words vectorizer with default settings.
count_vect = CountVectorizer()

In [22]:
# Learn the vocabulary from the training messages only and encode them as token counts.
X_train_counts = count_vect.fit_transform(X_train)

In [23]:
from sklearn.feature_extraction.text import TfidfTransformer

In [24]:
# Transformer that re-weights a count matrix by TF-IDF (default settings).
tfidf = TfidfTransformer()

In [32]:
# Two-step route: raw counts -> TF-IDF weights.
# Note: X_train_tfidf is reassigned further down by the one-step TfidfVectorizer.
X_train_tfidf = tfidf.fit_transform(X_train_counts)

In [33]:
from sklearn.feature_extraction.text import TfidfVectorizer

# One-step alternative: TfidfVectorizer does the counting and the TF-IDF weighting together.
vectorizer = TfidfVectorizer()

In [34]:
# Fit on the raw training messages and produce their TF-IDF features
# (replaces the X_train_tfidf computed via CountVectorizer + TfidfTransformer).
X_train_tfidf = vectorizer.fit_transform(X_train)

In [35]:
from sklearn.svm import LinearSVC

# Linear support-vector classifier with default hyper-parameters.
clf = LinearSVC()

# Train on the TF-IDF features of the training messages.
clf.fit(X_train_tfidf, y_train)



In [36]:
from sklearn.pipeline import Pipeline

# Chain vectorisation and classification so raw text can be passed straight to
# fit/predict and the vectorizer is only ever fitted on training data.
text_clf = Pipeline(
    steps=[
        ("tfidf", TfidfVectorizer()),
        ("SVM", LinearSVC()),
    ]
)

text_clf.fit(X_train, y_train)



In [39]:
from sklearn.metrics import accuracy_score, classification_report


# Score the fitted pipeline on the held-out messages.
pred = text_clf.predict(X_test)

# Overall accuracy plus per-class precision / recall / F1.
accuracy = accuracy_score(y_test, pred)
classfication_repo = classification_report(y_test, pred)

print(accuracy, classfication_repo, sep="\n")

0.989668297988037
              precision    recall  f1-score   support

         ham       0.99      1.00      0.99      1593
        spam       0.97      0.95      0.96       246

    accuracy                           0.99      1839
   macro avg       0.98      0.97      0.98      1839
weighted avg       0.99      0.99      0.99      1839



In [40]:
# Location of the tab-separated movie-review file.
# NOTE(review): absolute, machine-specific path — consider a path relative to a
# configurable data directory.
PATH = "/Users/test/Documents/Software-projects/Python Projects/Deep-Learning-Projects/Deep-Learning-Overfitting-Cook-Book/data/moviereviews.tsv"

In [41]:
# Load the movie reviews (tab-delimited).
# Note: this rebinds `df`, so the spam frame loaded earlier is no longer reachable under that name.
df = pd.read_csv(PATH, sep="\t")

In [42]:
# Display the frame (the notebook shows only its first and last rows).
df

Unnamed: 0,label,review
0,neg,how do films like mouse hunt get into theatres...
1,neg,some talented actresses are blessed with a dem...
2,pos,this has been an extraordinary year for austra...
3,pos,according to hollywood movies made in last few...
4,neg,my first press screening of 1998 and already i...
...,...,...
1995,pos,"i like movies with albert brooks , and i reall..."
1996,pos,it might surprise some to know that joel and e...
1997,pos,the verdict : spine-chilling drama from horror...
1998,pos,i want to correct what i wrote in a former ret...


In [43]:
# Count missing values per column.
df.isna().sum()

label      0
review    35
dtype: int64

In [44]:
# Discard rows with a missing review (the label column has no nulls).
df = df.dropna()

In [45]:
# `collections.Counter` is the runtime multiset; `typing.Counter` is only a
# deprecated annotation alias and should not be instantiated.
from collections import Counter

# Report how many reviews each sentiment class has and its share of the dataset.
# Select the label column by name rather than by position so the cell keeps
# working if columns are added or reordered.
target = df["label"].to_numpy()
counter = Counter(target)
for k, v in counter.items():
  per = v / len(target) * 100  # share of this class, in percent
  print("Class=%s, Count=%d, Percentage=%.3f%%" % (k, v, per))

Class=neg, Count=983, Percentage=50.025%
Class=pos, Count=982, Percentage=49.975%


In [46]:
# Remove reviews that consist solely of whitespace: they are non-null strings,
# so the earlier dropna() leaves them in place.
# A vectorised mask replaces the itertuples() loop, which overwrote the global
# `i` (the vocabulary id counter from the first cells) and unpacked columns by position.
blank_mask = df["review"].str.isspace()
blanks = df.index[blank_mask].tolist()  # index labels of the whitespace-only reviews

df = df.drop(index=blanks)

In [47]:
# (rows, columns) remaining after dropping null and whitespace-only reviews.
df.shape

(1938, 2)

In [48]:
# Features are the review texts; targets are the pos/neg sentiment labels.
X = df['review']
y = df["label"]

In [49]:
# Split with scikit-learn's default test size. Without a fixed seed every run
# shuffles differently, so the metrics reported below could not be reproduced;
# use the same seed as the spam split earlier in the notebook.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [50]:
# Same TF-IDF + linear SVM pipeline as used for the spam data, now fitted on the reviews.
text_clf = Pipeline(
    steps=[
        ("tfidf", TfidfVectorizer()),
        ("SVM", LinearSVC()),
    ]
)
text_clf.fit(X_train, y_train)

# Score on the held-out reviews: overall accuracy plus per-class precision / recall / F1.
pred = text_clf.predict(X_test)
accuracy = accuracy_score(y_test, pred)
classfication_repo = classification_report(y_test, pred)

print(accuracy, classfication_repo, sep="\n")

0.8494845360824742
              precision    recall  f1-score   support

         neg       0.83      0.86      0.85       232
         pos       0.87      0.84      0.85       253

    accuracy                           0.85       485
   macro avg       0.85      0.85      0.85       485
weighted avg       0.85      0.85      0.85       485



