In [26]:
from sklearn.model_selection import StratifiedKFold
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
from sklearn.metrics import f1_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
import pandas as pd
import numpy as np
!pip install nltk
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords



[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [31]:
import re
def clean_text(sentences, stopwords):
  sentences = sentences.lower()
  split = re.findall(r"[\w']+|[!?]", sentences)
  i = 0
  while i < len(split)-1:
    #Check if there are multiple exclammation/question marks and only leave 1 of them in the sentence
    if split[i] == split[i+1]:
      del split[i+1]
      i-=1
    i+= 1
  # Remove all instances of \n where only the n remains
  split = [i for i in split if i != "n"]
  return " ".join(split)

In [32]:
def preprocess(length, ngram, d):
  X = df['selftext'][0:length].tolist()
  Y = df['labels'][0:length].tolist()
  for i, post in enumerate(X):
    X[i] = clean_text(post)
  if ngram == "binary":
    vectorizer = CountVectorizer(vocabulary = d, binary=True)
  elif ngram == "unigram":
    vectorizer = CountVectorizer(vocabulary = d)
  else:
    vectorizer = CountVectorizer(vocabulary = d, ngram_range = (2, 2), binary=False) #Bigram/binary
  bag = vectorizer.fit_transform(X)
  #len(bag) == 100 - vectors of posts
  bag = bag.toarray()
  skf = StratifiedKFold(n_splits=10)
  skf.get_n_splits(bag, Y)
  X_train, y_train, X_test, y_test = [], [], [], []
  for i, (train_index, test_index) in enumerate(skf.split(bag, Y)):
    xtrain, ytrain, xtest, ytest = [], [], [], []
    for j in train_index:
      xtrain.append(bag[j])
      ytrain.append(Y[j])
    for j in test_index:
      xtest.append(bag[j])
      ytest.append(Y[j])
    X_train.append(xtrain)
    y_train.append(ytrain)
    X_test.append(xtest)
    y_test.append(ytest)
  return X_train, y_train, X_test, y_test
  #for 100 posts:
  #len(X_train, y_train) == 10, each of the 10 folds have 90 np vector arrays (posts)
  #len(X_test, y_test) == 10, 10 np arrays per fold - 100 posts total
# print(X_test)
# print(y_test)
# print(len(X_test))
# print(X_test[0])
# print(len(X_test[0]))

In [7]:
def classify(X_train, y_train, X_test, y_test, length, count, label):
  w = {0:count, 1:length-count}
  lr = LogisticRegression(C=100.0, random_state=1, solver='lbfgs', multi_class='ovr', max_iter = 1000000, class_weight = w)

  # Use metrics.accuracy_score to measure the score
  #'macro' favors minority, 'weighted' favors majority, 'micro' favors none (when you have multiclass)
  scores = []
  precision = []
  recall = []
  for i in range(0, 10):
    #Get 9 folds for training
    xtrain = X_train[i]
    ytrain = y_train[i]
    lr.fit(xtrain, ytrain)

    xtest = X_test[i]
    ytest = y_test[i]
    y_predict = lr.predict(xtest)
    if label in [1.0, 4.0]:
      score = f1_score(ytest, y_predict, labels=np.unique(y_predict))
    else:
      score = f1_score(ytest, y_predict, labels=np.unique(y_predict))
    scores.append("%.3f" %score)
    prec = precision_score(ytest, y_predict)
    precision.append("%.3f"%prec)
    rec = recall_score(ytest, y_predict)
    recall.append(rec)
  return lr, scores, precision, recall

In [29]:
#Making dictionary to put as the argument for the "vocabulary" parameter in CountVectorizer - need this to get words that the model has not yet seen
df = pd.read_csv('/content/Labeled Posts - preprocessed_csv.csv')
d=[]
count = 0
for i in df['selftext']:
  for word in i.split():
    if word not in d:
      d.append(word)
print(len(d)) #41716

def average(lst):
  s = 0
  for i in lst:
    s += float(i)
  return s/len(lst)

41716


In [33]:
df = pd.read_csv('/content/Labeled Posts - preprocessed_csv.csv')
for i in [2.0]:
  count = 0
  for ind in df.index:
    if df.loc[ind, "labels"] != i:
      df.loc[ind, "labels"] = 0
    else:
      df.loc[ind, "labels"] = 1
      count += 1
  length = 3500
  unigram, bigram, binary = [], [], []
  while length < 3600:
    X_train, y_train, X_test, y_test = preprocess(length, "unigram", d)
    lr1, unigram, p1, r1 = classify(X_train, y_train, X_test, y_test, length, count, i)
    X_train, y_train, X_test, y_test = preprocess(length, "bigram", d)
    lr2, bigram, p2, r2 = classify(X_train, y_train, X_test, y_test, length, count, i)
    X_train, y_train, X_test, y_test = preprocess(length, "binary", d)
    lr3, binary, p3, r3 = classify(X_train, y_train, X_test, y_test, length, count, i)
    print(length, "posts: ")
    print("-"*15)
    print("Scores for", i, ": \n")
    print("Precision", average(p1), average(p2), average(p3))
    print("Recall", average(r1), average(r2), average(r3))
    print("Unigram F1: %.3f"%average(unigram))
    print(unigram)
    print("-"*15)
    print("Bigram F1: %.3f"%average(bigram))
    print(bigram)
    print("-"*15)
    print("Binary F1: %.3f"%average(binary))
    print(binary)
    print("-"*15)
    print("\n")
    length += 500
  df = pd.read_csv('/content/Labeled Posts - preprocessed_csv.csv')

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


3500 posts: 
---------------
Scores for 2.0 : 

Precision 0.47850000000000004 0.035699999999999996 0.5088
Recall 0.36052631578947375 0.7 0.3283625730994152
Unigram F1: 0.405
['0.556', '0.357', '0.438', '0.471', '0.370', '0.343', '0.286', '0.400', '0.486', '0.343']
---------------
Bigram F1: 0.069
['0.098', '0.098', '0.098', '0.098', '0.098', '0.098', '0.098', '0.000', '0.000', '0.000']
---------------
Binary F1: 0.390
['0.432', '0.333', '0.452', '0.471', '0.345', '0.414', '0.357', '0.296', '0.375', '0.424']
---------------




In [34]:
def a(ngram):
  vectorizer = 0
  if ngram == "binary":
    vectorizer = CountVectorizer(vocabulary=d, binary=True)
  elif ngram == "unigram":
    vectorizer = CountVectorizer(vocabulary=d)
  else: #bigram
    vectorizer = CountVectorizer(vocabulary=d, ngram_range = (2, 2))
  return vectorizer

def b(vectorizer):
  X = np.array(df['selftext'][3100:7751].tolist())
  for i, post in enumerate(X):
    X[i] = clean_text(post)

  bag = vectorizer.fit_transform(X)
  bag = bag.toarray()
  return bag

count = 0
i = 2.0
for ind in df.index:
  if df.loc[ind, "labels"] != i:
      df.loc[ind, "labels"] = 0
  else:
    df.loc[ind, "labels"] = 1
    count += 1

In [35]:
#Getting number of advice posts (1.0 for advice, 0.0 for anything else)
vectorizer = a("unigram")
data = b(vectorizer)
res1 = lr1.predict(data)
print(np.count_nonzero(res1 == 1.0))

vectorizer = a("bigram")
data = b(vectorizer)
res2 = lr2.predict(data)
print(np.count_nonzero(res2 == 1.0))

vectorizer = a("binary")
data = b(vectorizer)
res3 = lr3.predict(data)
print(np.count_nonzero(res3 == 1.0))



165




0




148


In [None]:
import csv
df = pd.read_csv('/content/Labeled Posts - preprocessed_csv.csv')

def p(x):
  l = []
  for ind, pred in enumerate(x):
    if pred == 1.0:
      l.append(ind+3100)
  return l
l1 = p(res1)
l3 = p(res3)
prediction1 = []
prediction2 = []
for i in l1:
  prediction1.append(df['selftext'][i])
for i in l3:
  prediction2.append(df['selftext'][i])
for i in range(11):
  prediction2.append("")

c = []
for i in l1:
  if l3.count(i) > 0:
    c.append(df['selftext'][i])
for i in range(57):
  c.append("")
c.append(l1)
c.append(l3)
print(len(prediction1), len(prediction2), len(c))
d = {'unigram': prediction1, 'binary': prediction2, 'both': c}

df = pd.DataFrame(d)

# saving the dataframe
df.to_csv('predictions.csv')

In [None]:
X = np.array(df['selftext'][501:3500].tolist())
bag = vectorizer.fit_transform(X)
bag = bag.toarray()
y_predict = lr2.predict(bag)
ytest = np.array(df['labels'][501:3500].tolist())
score = f1_score(ytest, y_predict, labels=np.unique(y_predict))
print(score)

In [None]:
X = np.array(df['selftext'][1:100].tolist())
vectorizer = CountVectorizer()
bag = vectorizer.fit_transform(X)
print(bag.toarray()[0])

[0 0 0 ... 0 0 0]
