In [5]:
import nltk 
import numpy as np
import pandas as pd
from nltk.stem import WordNetLemmatizer
from sklearn.linear_model import LogisticRegression, LinearRegression
# import warnings
# warnings.filterwarnings('ignore')

In [6]:
wordnet_lemmatizer = WordNetLemmatizer() # Converts words into the base forms (dogs and dog becomes the same word)

In [7]:
stopwords = set(w.rstrip() for w in open('stopwords.txt'))

In [8]:
df = pd.read_csv("Reddit_Data.csv")
df = df.dropna() # Removes 100 rows (From 37249 to 37149)
df = df.head(28000)
df

Unnamed: 0,clean_comment,category
0,family mormon have never tried explain them t...,1
1,buddhism has very much lot compatible with chr...,1
2,seriously don say thing first all they won get...,-1
3,what you have learned yours and only yours wha...,0
4,for your own benefit you may want read living ...,1
...,...,...
28071,all the aunti nationals can take note don ask ...,1
28072,like everyone else too angered pulwama but ind...,1
28073,,0
28074,fucking epic learn cybersecurity from israel a...,1


In [9]:
# Loops through each comment and tokenize it and remove stop words. Also, create word index map to compute word frequencies
def tokenize_comments(df):
    tokens_list = [] 
    word_index_map = {}
    i=0
    for index, row in df.iterrows():
        tokens = nltk.tokenize.word_tokenize(row["clean_comment"])# Tokenize the comments
        tokens = [t for t in tokens if len(t) > 2]
        tokens = [wordnet_lemmatizer.lemmatize(t) for t in tokens] # Convert words into their base form
        tokens = [t for t in tokens if t not in stopwords] # Only adds words to the tokens if they are not stopwords and the length of the string is > 2
        for token in tokens: # Loop through and get add each token/word to word index map
            if token not in word_index_map:
                word_index_map[token] = i
                i += 1
        tokens_list.append(tokens)
    return tokens_list, word_index_map # return tokenize list and the word index map

In [10]:
# Structure and normalize each tokens list
# np.seterr(divide = 'ignore') # Ignore divide by zero warning
def normalize_tokens(tokens_list, word_index_map):
    normalized_tokens_list = []
    for token_list in tokens_list: # Loop each tokens list (comment that has been tokenized)
        x = np.zeros(len(word_index_map) + 1)
        for token in token_list: # Loop each token in the comment 
            i = word_index_map[token] # Get (first) occurence of word. Arbritray number but needed for organization
            x[i] += 1 # Increment
        x = np.divide(x,x.sum()) # Divide the frequency vector by total sum allowing for us to investigate it words total usage in the comment
        normalized_tokens_list.append(x)
    return normalized_tokens_list

In [11]:
def attach_labels(tokens_list, labels):
    for i in range(len(tokens_list)):
        tokens_list[i] = np.append(tokens_list[i], np.array(labels)[i])
    return tokens_list

In [12]:
# Tokenize comments
tokens_list, word_index_map = tokenize_comments(df)

In [13]:
# Normalize token
normalized_tokens = normalize_tokens(tokens_list, word_index_map)

  # Remove the CWD from sys.path while we load stuff.


In [14]:
# # Attach the sentiment labels back to its respective comment
new_tokens_list = attach_labels(normalized_tokens, df["category"])
# new_tokens_list

In [15]:
# Convert the list of np arrays into np mats and then into dataframe
df = pd.DataFrame(np.mat(normalized_tokens)) 
df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,43152,43153,43154,43155,43156,43157,43158,43159,43160,43161
0,0.050000,0.05,0.05,0.050000,0.05,0.05,0.100000,0.05,0.05,0.05,...,0.0,0.0,0.0,0.000,0.000,0.000000,0.000000,0.000,0.0,1.0
1,0.000000,0.00,0.00,0.000000,0.00,0.00,0.000000,0.00,0.00,0.00,...,0.0,0.0,0.0,0.000,0.000,0.000000,0.000000,0.000,0.0,1.0
2,0.029412,0.00,0.00,0.029412,0.00,0.00,0.000000,0.00,0.00,0.00,...,0.0,0.0,0.0,0.000,0.000,0.000000,0.000000,0.000,0.0,-1.0
3,0.000000,0.00,0.00,0.000000,0.00,0.00,0.000000,0.00,0.00,0.00,...,0.0,0.0,0.0,0.000,0.000,0.000000,0.000000,0.000,0.0,0.0
4,0.000000,0.00,0.00,0.000000,0.00,0.00,0.000000,0.00,0.00,0.00,...,0.0,0.0,0.0,0.000,0.000,0.000000,0.000000,0.000,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
27995,0.000000,0.00,0.00,0.000000,0.00,0.00,0.000000,0.00,0.00,0.00,...,0.0,0.0,0.0,0.125,0.125,0.000000,0.000000,0.000,0.0,1.0
27996,0.000000,0.00,0.00,0.000000,0.00,0.00,0.017857,0.00,0.00,0.00,...,0.0,0.0,0.0,0.000,0.000,0.008929,0.008929,0.000,0.0,1.0
27997,,,,,,,,,,,...,,,,,,,,,,0.0
27998,0.000000,0.00,0.00,0.000000,0.00,0.00,0.000000,0.00,0.00,0.00,...,0.0,0.0,0.0,0.000,0.000,0.000000,0.000000,0.125,0.0,1.0


In [16]:
# Drop na values
df = df.dropna(axis=0)

In [17]:
df_target = df[df.columns[-1]] # Extract target variable 
df = df.drop(df.columns[-1], axis = 1) # Drop target column from data

In [25]:
# Split the data
from sklearn.model_selection import train_test_split
df_train, df_test, df_train_target, df_test_target = train_test_split(df, df_target, test_size=0.2, random_state=33)

In [19]:
df_train

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,43151,43152,43153,43154,43155,43156,43157,43158,43159,43160
363,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2607,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4822,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9622,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
26264,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
27542,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
584,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5905,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2467,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [26]:
model = LogisticRegression()
model.fit(df_train, df_train_target)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [28]:
print("Classification rate", model.score(df_train, df_train_target))

Classification rate 0.7128369241864239


In [40]:
model.predict(df_test)

array([-1.,  1.,  1., ...,  1.,  0.,  1.])

In [None]:
# Function that predicts
def predict(X):
    # Can't do model.predict() bc it needs a 2d array
    

In [None]:
# Function that takes in a string and maps it to a vector according to the word index map
def to_vector(headline): 
    