In [None]:
from sklearn.model_selection import StratifiedKFold
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
from sklearn.metrics import f1_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
import pandas as pd
import numpy as np
!pip install nltk
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
import string
import re
import pickle



[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
# Stopwords are kept. Only words (apostrophes included) and the marks '!' / '?' survive,
# and immediate repeats are collapsed (e.g. 'yay!!!' --> 'yay !').
def clean_text(sentences):
  """Tokenize `sentences`, collapse adjacent duplicate tokens, and re-join with spaces.

  Tokens are runs of word characters/apostrophes, or a single '!' or '?'.
  After collapsing repeats, every token that is exactly "n" is dropped
  (the leftover of a literal backslash-n in the raw text).
  """
  tokens = re.findall(r"[\w']+|[!?]", sentences)
  collapsed = []
  for token in tokens:
    # Keep a token only when it differs from the one kept just before it.
    if not collapsed or collapsed[-1] != token:
      collapsed.append(token)
  # The lone-"n" filter runs after de-duplication, exactly as before.
  return " ".join(token for token in collapsed if token != "n")

In [None]:
def classify(X_train, y_train, X_test, y_test, length, count, label):
  """Fit and score a class-weighted logistic regression on every cross-validation fold.

  Args:
    X_train, y_train: per-fold training features and labels (one entry per fold).
    X_test, y_test: per-fold held-out features and labels, aligned with the training folds.
    length: total number of posts; only used to derive the class weights.
    count: number of posts in class 1; only used to derive the class weights.
    label: unused; kept so existing callers keep working.

  Returns:
    (scores, precision, recall), each with one entry per fold. F1 scores and
    precision are "%.3f"-formatted strings, while recall values are raw floats;
    this mixed format is preserved for existing callers.
  """
  # Each class is weighted by the size of the *other* class, so the rarer class counts more.
  w = {0:count, 1:length-count}
  lr = LogisticRegression(C=100.0, random_state=1, solver='lbfgs', max_iter = 10000, multi_class='ovr', class_weight = w)

  # f1/precision/recall are called with their defaults (no `average` argument).
  #'macro' favors minority, 'weighted' favors majority, 'micro' favors none (when you have multiclass)
  scores = []
  precision = []
  recall = []
  # Iterate over the folds actually supplied instead of assuming there are exactly 10.
  for xtrain, ytrain, xtest, ytest in zip(X_train, y_train, X_test, y_test):
    # The same estimator object is refit on each fold's training portion.
    lr.fit(xtrain, ytrain)
    y_predict = lr.predict(xtest)

    scores.append("%.3f" % f1_score(ytest, y_predict))
    precision.append("%.3f" % precision_score(ytest, y_predict))
    recall.append(recall_score(ytest, y_predict))
  return scores, precision, recall

In [None]:
# Scratch check: load the labelled posts, copy the post bodies into a plain list,
# overwrite the entry at index 243 and print it back.
df = pd.read_csv('/content/Labeled Posts - preprocessed_csv.csv')
x = df['selftext'].tolist()
# Only the list `x` is modified here; `df` itself is not written to.
x[243] = "AB"
print(x[243])
# Disabled earlier experiment that cleaned the posts in place on a DataFrame slice:
# X = df.loc[0:244]
# for i, post in enumerate(X['selftext']):
#   X['selftext'][i] = clean_text(post)
#   if i == 243:
#     print(X['selftext'][i])

AB


In [None]:
# (Re)load the labelled posts. `preprocess` below reads this module-level `df`.
df = pd.read_csv('/content/Labeled Posts - preprocessed_csv.csv')
def preprocess(length):
  """Return the first `length` posts of the module-level `df`, each passed through `clean_text`.

  Reads df['selftext'][0:length]; the result is a plain list of cleaned strings.
  """
  raw_posts = df['selftext'][0:length].tolist()
  return [clean_text(post) for post in raw_posts]
def vectorize(X, ngram):
  """Convert the posts in `X` into a dense bag-of-words array.

  `ngram` selects the CountVectorizer configuration:
    "unigram" -> default settings (unigram counts)
    "binary"  -> binary=True (unigram presence/absence)
    anything else -> ngram_range=(2, 2) with binary=True (bigram presence/absence)

  Returns the fitted-and-transformed matrix converted with `.toarray()`,
  one row per post.
  """
  if ngram == "unigram":
    options = {}
  elif ngram == "binary":
    options = {"binary": True}
  else:
    # Bigram/binary; also the fallback for any unrecognised mode string.
    options = {"ngram_range": (2, 2), "binary": True}
  return CountVectorizer(**options).fit_transform(X).toarray()

def save_file(name, item):
  """Pickle `item` to the path `name`, replacing any existing file at that path."""
  with open(name, mode='wb') as out_handle:
    pickle.dump(item, out_handle)

# Clean the first 3500 posts once, then build and pickle one feature matrix per mode.
# NOTE: the file names are left unchanged so existing pickles still load by name.
# The 'bigram' matrix is built with binary=True despite "binaryFalse" in its file name.
X = preprocess(3500)
for ngram_mode, out_name in (
    ('unigram', "3500_unigram_binaryFalse"),
    ('bigram', "3500_bigram_binaryFalse"),
    ('binary', "3500_binary"),
):
  bag = vectorize(X, ngram_mode)
  save_file(out_name, bag)

In [None]:
def get_folds(bag):
  """Split `bag` and its labels into 10 stratified cross-validation folds.

  Labels are read from the module-level `df['labels']`: the first len(bag)
  labels are paired with the rows of `bag`, so row k of `bag` must belong to
  row k of `df`.

  Args:
    bag: sequence of feature rows supporting len() and integer indexing.

  Returns:
    X_train, y_train, X_test, y_test: four lists with one entry per fold; each
    entry is itself a list of feature rows (or labels) for that fold.
  """
  # Take the label count from the data passed in rather than from a global
  # `length`, which only exists once a later cell has run.
  Y = df['labels'][0:len(bag)].tolist()
  skf = StratifiedKFold(n_splits=10)
  X_train, y_train, X_test, y_test = [], [], [], []
  for train_index, test_index in skf.split(bag, Y):
    # Rows are gathered one at a time so each fold stays a plain list of rows,
    # matching the shape callers already receive.
    X_train.append([bag[j] for j in train_index])
    y_train.append([Y[j] for j in train_index])
    X_test.append([bag[j] for j in test_index])
    y_test.append([Y[j] for j in test_index])
  return X_train, y_train, X_test, y_test

In [None]:
def open_file(name):
  """Load and return the object pickled at the path `name`."""
  with open(name, mode='rb') as in_handle:
    loaded = pickle.load(in_handle)
  return loaded
def average(lst):
  """Return the arithmetic mean of `lst`, converting every element with float().

  Elements may be numbers or numeric strings. The values are added
  left to right, starting from 0.
  """
  total = 0
  for value in map(float, lst):
    total = total + value
  return total / len(lst)

In [None]:
df = pd.read_csv('/content/Labeled Posts - preprocessed_csv.csv')
for i in [2.0]:
  # One-vs-rest relabelling: posts labelled `i` become 1.0, everything else 0.0.
  # get_folds() reads labels from the module-level `df`, so the column is replaced here.
  df["labels"] = (df["labels"] == i).astype(float)
  length = 3000
  unigram, bigram, binary = [], [], []
  #~33 seconds per fold (3500 posts) --> 5.5 minutes
  while length < 3100:
    # Count positives among the `length` posts actually evaluated, so the class
    # weights {0: count, 1: length - count} describe the same set of rows.
    count = int(df["labels"][0:length].sum())
    unigram_bag = open_file("3500_unigram_binaryFalse")
    X_train, y_train, X_test, y_test = get_folds(unigram_bag[0:length])
    unigram, p1, r1 = classify(X_train, y_train, X_test, y_test, length, count, i)
    bigram_bag = open_file("3500_bigram_binaryFalse")
    X_train, y_train, X_test, y_test = get_folds(bigram_bag[0:length])
    bigram, p2, r2 = classify(X_train, y_train, X_test, y_test, length, count, i)
    binary_bag = open_file("3500_binary")
    X_train, y_train, X_test, y_test = get_folds(binary_bag[0:length])
    binary, p3, r3 = classify(X_train, y_train, X_test, y_test, length, count, i)
    print(length, "posts: ")
    print("-"*15)
    print("Scores for", i, ": \n")
    print("Precision", average(p1), average(p2), average(p3))
    print("Recall", average(r1), average(r2), average(r3))
    print("Unigram F1: %.3f"%average(unigram))
    print(unigram)
    print("-"*15)
    print("Bigram F1: %.3f"%average(bigram))
    print(bigram)
    print("-"*15)
    print("Binary F1: %.3f"%average(binary))
    print(binary)
    print("-"*15)
    print("\n")
    length += 500
  # Reload the CSV so the original multi-class labels are back for the next target label.
  df = pd.read_csv('/content/Labeled Posts - preprocessed_csv.csv')

['0.556', '0.385', '0.429', '0.593', '0.483', '0.400', '0.500', '0.519', '0.414', '0.562'] ['0.500', '0.500', '0.500', '0.727', '0.538', '0.429', '0.583', '0.636', '0.462', '0.562'] [0.625, 0.3125, 0.375, 0.5, 0.4375, 0.375, 0.4375, 0.4375, 0.375, 0.5625]
3000 posts: 
---------------
Scores for 2.0 : 

Precision 0.5437 0.913 0.5748
Recall 0.44375 0.2625 0.39375
Unigram F1: 0.484
['0.556', '0.385', '0.429', '0.593', '0.483', '0.400', '0.500', '0.519', '0.414', '0.562']
---------------
Bigram F1: 0.394
['0.609', '0.222', '0.435', '0.545', '0.222', '0.455', '0.300', '0.455', '0.476', '0.222']
---------------
Binary F1: 0.455
['0.529', '0.348', '0.500', '0.615', '0.387', '0.387', '0.364', '0.483', '0.370', '0.571']
---------------


