# 1. Loading Libraries and Data


In [None]:
import numpy as np 
import pandas as pd 
import seaborn as sns
from pathlib import Path
from google.colab import drive
import os

import re
import string

# sklearn 
from sklearn import model_selection
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import f1_score
from sklearn import preprocessing, decomposition, model_selection, metrics, pipeline
from sklearn.model_selection import GridSearchCV,StratifiedKFold,RandomizedSearchCV

In [None]:
import locale

# Report the encoding the runtime prefers before it is overridden below.
print(locale.getpreferredencoding())


def getpreferredencoding(do_setlocale = True):
    """Return "UTF-8" unconditionally; ``do_setlocale`` is accepted but ignored."""
    return "UTF-8"


# Replace the stdlib lookup with the stub above so later callers always get UTF-8.
locale.getpreferredencoding = getpreferredencoding

UTF-8


In [None]:
# Mount Google Drive
drive.mount('/content/drive')

# Set the directory to your specific folder in Google Drive
folder_path = "/content/drive/My Drive/Colab Notebooks/MMAI 891 NLP/"

# Check if the directory exists and print the list of files and folders inside it
if os.path.exists(folder_path):
    files = os.listdir(folder_path)
    print(files)
else:
    print("Directory does not exist.")

root_dir = "/content/drive/My Drive/" 
# choose where you want your project files to be saved
project_folder = "Colab Notebooks/MMAI 891 NLP/"
os.chdir(root_dir + project_folder)
!pwd

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
['Project Brief - Zindi Classification.docx', 'Test.csv', 'Train.csv', 'SampleSubmission.csv', 'NLP_Primer_twitter_challenge.ipynb', 'MMAI 891 NLP Project ZINDI Shallow ML.ipynb']
/content/drive/My Drive/Colab Notebooks/MMAI 891 NLP


In [None]:
# Load the competition data from the current working directory.
# `axis` is passed by keyword: the positional form `dropna(0)` is deprecated
# (it emits a FutureWarning) and is rejected by pandas 2.x.
train = pd.read_csv('Train.csv').dropna(axis=0)  # drop training rows that contain any missing value
test = pd.read_csv('Test.csv').fillna('')  # replace missing test values with empty strings
test['label'] = 0  # placeholder column, to be filled in with predictions later
train.head()  # preview the first rows

  train = pd.read_csv('Train.csv').dropna(0) # Read in train, ignoring one row with missing data


Unnamed: 0,tweet_id,safe_text,label,agreement
0,CL1KWCMY,Me &amp; The Big Homie meanboy3000 #MEANBOY #M...,0.0,1.0
1,E3303EME,I'm 100% thinking of devoting my career to pro...,1.0,1.0
2,M4IVFSMS,"#whatcausesautism VACCINES, DO NOT VACCINATE Y...",-1.0,1.0
3,1DR6ROZ4,I mean if they immunize my kid with something ...,-1.0,1.0
4,J77ENIIE,Thanks to <user> Catch me performing at La Nui...,0.0,1.0


In [None]:
# Number of missing values in each column of the training frame.
train.isna().sum()

tweet_id     0
safe_text    0
label        0
agreement    0
dtype: int64

In [None]:
# Number of missing values in each column of the test frame.
test.isna().sum()

tweet_id     0
safe_text    0
label        0
dtype: int64

In [None]:
# Class balance: how many training rows carry each label value.
train['label'].value_counts()

 0.0    4908
 1.0    4053
-1.0    1038
Name: label, dtype: int64

# 2. Data Preprocessing

In [None]:
import nltk
import spacy

In [None]:
# Load the spaCy 'en_core_web_sm' pipeline.
# NOTE(review): `nlp` is not referenced by any other cell visible in this part of the notebook.
nlp = spacy.load('en_core_web_sm')

In [None]:
# Work on a copy so the preprocessing cells below do not modify the loaded `train` frame.
train_df = train.copy()

In [None]:
# Applying a first round of text cleaning techniques

def clean_text(text):
    '''Normalise a raw text string before tokenization.

    Steps, in order: lowercase; drop text in square brackets; drop links;
    drop angle-bracket tags such as ``<user>``; drop ASCII punctuation;
    turn newlines into spaces; drop any word that contains a digit.

    Parameters
    ----------
    text : str
        Raw text.

    Returns
    -------
    str
        The cleaned text. Runs of spaces left behind by the removals are
        not collapsed.
    '''
    # Patterns are raw string literals so regex escapes such as \[ and \S are
    # not parsed as (invalid) Python string escapes.
    text = text.lower()
    text = re.sub(r'\[.*?\]', '', text)
    text = re.sub(r'https?://\S+|www\.\S+', '', text)
    text = re.sub(r'<.*?>+', '', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    # Replace line breaks with a space (not nothing) so the words on either
    # side of a newline are not fused into one token.
    text = re.sub(r'\n', ' ', text)
    text = re.sub(r'\w*\d\w*', '', text)
    return text

# Clean the training tweets in place.
# TODO: the matching step for the test set is still disabled (next line).
train_df['safe_text'] = train_df['safe_text'].apply(clean_text)
# test_df['text'] = test['text'].apply(lambda x: clean_text(x))

# Preview the cleaned text
train_df['safe_text'].head()

0    me amp the big homie  meanboy mb mbs mmr stegm...
1    im  thinking of devoting my career to proving ...
2    whatcausesautism vaccines do not vaccinate you...
3    i mean if they immunize my kid with something ...
4    thanks to  catch me performing at la nuit nyc ...
Name: safe_text, dtype: object

In [None]:
# Side-by-side comparison of four NLTK tokenizers on one example sentence.
text = "Are you coming , aren't you"

tokenizer1 = nltk.tokenize.WhitespaceTokenizer()
tokenizer2 = nltk.tokenize.TreebankWordTokenizer()
tokenizer3 = nltk.tokenize.WordPunctTokenizer()
tokenizer4 = nltk.tokenize.RegexpTokenizer(r'\w+')

print("Example Text: ",text)
print("------------------------------------------------------------------------------------------------")

# Each entry pairs the printed heading with the tokenizer it describes.
for heading, demo_tokenizer in (
    ("Tokenization by whitespace:- ", tokenizer1),
    ("Tokenization by words using Treebank Word Tokenizer:- ", tokenizer2),
    ("Tokenization by punctuation:- ", tokenizer3),
    ("Tokenization by regular expression:- ", tokenizer4),
):
    print(heading, demo_tokenizer.tokenize(text))

Example Text:  Are you coming , aren't you
------------------------------------------------------------------------------------------------
Tokenization by whitespace:-  ['Are', 'you', 'coming', ',', "aren't", 'you']
Tokenization by words using Treebank Word Tokenizer:-  ['Are', 'you', 'coming', ',', 'are', "n't", 'you']
Tokenization by punctuation:-  ['Are', 'you', 'coming', ',', 'aren', "'", 't', 'you']
Tokenization by regular expression:-  ['Are', 'you', 'coming', 'aren', 't', 'you']


In [None]:
# Split each cleaned training tweet into a list of word tokens (runs of \w characters).
# TODO: the matching step for the test set is still disabled (commented line below).
tokenizer = nltk.tokenize.RegexpTokenizer(r'\w+')
train_df['safe_text'] = train_df['safe_text'].apply(tokenizer.tokenize)
# test['text'] = test['text'].apply(lambda x: tokenizer.tokenize(x))
train_df['safe_text'].head()

0    [me, amp, the, big, homie, meanboy, mb, mbs, m...
1    [im, thinking, of, devoting, my, career, to, p...
2    [whatcausesautism, vaccines, do, not, vaccinat...
3    [i, mean, if, they, immunize, my, kid, with, s...
4    [thanks, to, catch, me, performing, at, la, nu...
Name: safe_text, dtype: object

In [None]:
def remove_stopwords(text):
    """
    Drop English stopwords from a sequence of tokens.

    Parameters
    ----------
    text : iterable of str
        Tokens to filter (despite the name, not a single string).

    Returns
    -------
    list of str
        The tokens that are not in NLTK's English stopword list, in their
        original order.
    """
    # Imported here so the function does not depend on a later notebook cell
    # having already brought `stopwords` into scope.
    import nltk
    from nltk.corpus import stopwords

    try:
        english_stopwords = set(stopwords.words('english'))
    except LookupError:
        # The corpus is not available locally yet: fetch it once, then retry.
        nltk.download('stopwords')
        english_stopwords = set(stopwords.words('english'))

    # Membership is checked against a set built once per call instead of
    # re-reading the stopword list for every token.
    return [w for w in text if w not in english_stopwords]


# Filter stopwords out of every training token list.
# TODO: the matching step for the test set is still disabled (commented line below).
train_df['safe_text'] = train_df['safe_text'].apply(remove_stopwords)
# test_df['safe_text'] = test['text'].apply(lambda x : remove_stopwords(x))
train_df.head()

Unnamed: 0,tweet_id,safe_text,label,agreement
0,CL1KWCMY,"[amp, big, homie, meanboy, mb, mbs, mmr, stegm...",0.0,1.0
1,E3303EME,"[im, thinking, devoting, career, proving, auti...",1.0,1.0
2,M4IVFSMS,"[whatcausesautism, vaccines, vaccinate, child]",-1.0,1.0
3,1DR6ROZ4,"[mean, immunize, kid, something, wont, secretl...",-1.0,1.0
4,J77ENIIE,"[thanks, catch, performing, la, nuit, nyc, ave...",0.0,1.0


In [None]:
import nltk

In [None]:
from nltk.corpus import stopwords

In [None]:
# Fetch the NLTK stopword lists that stopwords.words('english') reads from.
nltk.download("stopwords")

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [None]:
# Fetch the WordNet data; presumably needed by the WordNetLemmatizer demo below — confirm.
nltk.download("wordnet")

[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [None]:
# Stemming and lemmatization examples. They do not always improve results, so
# they are not used in the first pass; to be tried afterwards.
# NOTE: this cell rebinds the notebook-level names `text` and `tokenizer`.
text = "feet cats wolves talked"

tokenizer = nltk.tokenize.TreebankWordTokenizer()
tokens = tokenizer.tokenize(text)

# Stemmer
stemmer = nltk.stem.PorterStemmer()
stemmed_words = [stemmer.stem(token) for token in tokens]
print("Stemming the sentence: ", " ".join(stemmed_words))

# Lemmatizer
lemmatizer=nltk.stem.WordNetLemmatizer()
lemmatized_words = [lemmatizer.lemmatize(token) for token in tokens]
print("Lemmatizing the sentence: ", " ".join(lemmatized_words))

Stemming the sentence:  feet cat wolv talk
Lemmatizing the sentence:  foot cat wolf talked


In [None]:
# After preprocessing, the text format
def combine_text(list_of_text):
    '''Join a list of strings into a single string, separated by single spaces.'''
    return ' '.join(list_of_text)

# Turn each token list back into one space-separated string.
# TODO: the matching step for the test set is still disabled (commented line below).
train_df['safe_text'] = train_df['safe_text'].apply(combine_text)
# test_df['safe_text'] = test_df['safe_text'].apply(lambda x : combine_text(x))
train_df['safe_text']
train_df.head()

Unnamed: 0,tweet_id,safe_text,label,agreement
0,CL1KWCMY,amp big homie meanboy mb mbs mmr stegmanlife s...,0.0,1.0
1,E3303EME,im thinking devoting career proving autism isn...,1.0,1.0
2,M4IVFSMS,whatcausesautism vaccines vaccinate child,-1.0,1.0
3,1DR6ROZ4,mean immunize kid something wont secretly kill...,-1.0,1.0
4,J77ENIIE,thanks catch performing la nuit nyc ave show s...,0.0,1.0


In [None]:
# text preprocessing function combining all the steps 
def text_preprocessing(text):
    """
    Clean a raw text string and return it as space-joined, stopword-free tokens.

    Runs the same steps as the individual cells above in one call:
    ``clean_text`` -> ``\\w+`` tokenization -> English stopword removal ->
    join with single spaces.

    Parameters
    ----------
    text : str
        Raw text.

    Returns
    -------
    str
        The remaining tokens joined by single spaces.
    """
    tokenizer = nltk.tokenize.RegexpTokenizer(r'\w+')
    # Build the stopword set once per call; checking each token against
    # stopwords.words('english') directly would reload the list per token.
    english_stopwords = set(stopwords.words('english'))

    nopunc = clean_text(text)
    tokenized_text = tokenizer.tokenize(nopunc)
    # Named `kept_tokens` rather than `remove_stopwords`, which is also the
    # name of a function defined in this notebook.
    kept_tokens = [w for w in tokenized_text if w not in english_stopwords]
    combined_text = ' '.join(kept_tokens)
    return combined_text

# 3. Vectorizing (Bag of Words and TF-IDF)

Bag of words

In [None]:
# Bag of words: learn the vocabulary from the preprocessed training text and
# encode each document as a vector of term counts.
count_vectorizer = CountVectorizer()
train_vectors = count_vectorizer.fit_transform(train_df['safe_text'])
# test_vectors = count_vectorizer.transform(test_df["safe_text"])

## Print the first document's vector; .todense() expands that one row to a dense matrix (zeros included)
print(train_vectors[0].todense())

[[0 0 0 ... 0 0 0]]


TFIDF Features

$$\mathrm{TF}(t, d) = \frac{\text{number of times term } t \text{ appears in document } d}{\text{total number of terms in document } d}$$

$$\mathrm{IDF}(t) = 1 + \log\frac{N}{n_t}$$ where $N$ is the total number of documents and $n_t$ is the number of documents in which term $t$ appears.

In [None]:
# TF-IDF features over unigrams and bigrams. min_df=2 drops terms found in
# fewer than two documents; max_df=0.5 drops terms found in more than half.
tfidf = TfidfVectorizer(
    ngram_range=(1, 2),
    min_df=2,
    max_df=0.5,
)
train_tfidf = tfidf.fit_transform(train_df['safe_text'])
# test_tfidf = tfidf.transform(test_df["safe_text"])

# 4. Basic Model

In [None]:
# Baseline: logistic regression on the bag-of-words counts, scored with 5-fold
# cross-validation. No `scoring` is given, so the estimator's default score is used.
# `multi_class` is left at its default ('auto', the value previously passed
# explicitly) because the parameter is deprecated in recent scikit-learn releases.
clf = LogisticRegression(C=1.0, max_iter=1000, verbose=1)
scores = model_selection.cross_val_score(clf, train_vectors, train_df["label"], cv=5)
scores

# NOTE(review): presumably the `average` options for f1_score, kept as a reminder — confirm.
# [None, 'micro', 'macro', 'weighted']

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    3.0s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    3.6s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    1.7s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    1.5s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    1.4s finished


array([0.7065    , 0.7245    , 0.7295    , 0.7265    , 0.71585793])

In [None]:
# Fit the count-based classifier on the full training set.
clf.fit(train_vectors, train_df["label"])

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    2.2s finished


In [None]:
# Same baseline on the TF-IDF features, scored with 5-fold cross-validation.
# `multi_class` is left at its default ('auto', the value previously passed
# explicitly) because the parameter is deprecated in recent scikit-learn releases.
clf_tfidf = LogisticRegression(C=1.0, max_iter=1000)
scores = model_selection.cross_val_score(clf_tfidf, train_tfidf, train_df["label"], cv=5)
scores

array([0.721     , 0.74      , 0.7315    , 0.7325    , 0.71935968])