In [None]:
from sklearn.model_selection import StratifiedKFold
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
from sklearn.metrics import f1_score
import pandas as pd
import numpy as np
!pip install nltk
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords



[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [None]:
def clean_text(sentences, stop_words=None):
  """Lower-case a post and remove stop words from it.

  Args:
    sentences: Raw text of a single post.
    stop_words: Optional iterable of words to drop. When omitted, NLTK's
      English list (``stopwords.words('english')``) is used.

  Returns:
    The remaining whitespace-separated tokens joined by single spaces
    (an empty string when nothing is left).
  """
  if stop_words is None:
    stop_words = stopwords.words('english')
  # Build the lookup set once up front: fetching the stop-word list for every
  # token and scanning it linearly dominates the runtime on long posts.
  stop_words = set(stop_words)
  tokens = sentences.lower().split()
  # Join the surviving tokens back into a sentence.
  return " ".join(token for token in tokens if token not in stop_words)

In [None]:
def preprocess(length, ngram):
  """Vectorize the first `length` posts and cut them into 10 stratified folds.

  Reads the module-level `df`: column `selftext` supplies the documents and
  column `labels` the targets. Every post goes through `clean_text` before
  it is vectorized.

  Args:
    length: Number of leading rows of `df` to use.
    ngram: "binary" for 0/1 presence features, "unigram" for word counts;
      any other value selects bigram counts.

  Returns:
    A tuple (X_train, y_train, X_test, y_test). Each element is a list with
    one entry per fold; the X entries are lists of dense row vectors and the
    y entries are lists of labels. For 100 posts each of the 10 folds holds
    90 training vectors and 10 test vectors (100 test posts in total).
  """
  raw_posts = df['selftext'][0:length].tolist()
  Y = df['labels'][0:length].tolist()
  cleaned_posts = [clean_text(post) for post in raw_posts]

  # Pick the CountVectorizer options that correspond to the requested scheme.
  if ngram == "binary":
    options = {"binary": True}
  elif ngram == "unigram":
    options = {}
  else:
    options = {"ngram_range": (2, 2), "binary": False}
  vectorizer = CountVectorizer(**options)

  # One dense row per post.
  dense_rows = vectorizer.fit_transform(cleaned_posts).toarray()

  splitter = StratifiedKFold(n_splits=10)
  X_train, y_train, X_test, y_test = [], [], [], []
  for train_index, test_index in splitter.split(dense_rows, Y):
    X_train.append([dense_rows[j] for j in train_index])
    y_train.append([Y[j] for j in train_index])
    X_test.append([dense_rows[j] for j in test_index])
    y_test.append([Y[j] for j in test_index])
  return X_train, y_train, X_test, y_test
# print(X_test)
# print(y_test)
# print(len(X_test))
# print(X_test[0])
# print(len(X_test[0]))

In [None]:
def classify(X_train, y_train, X_test, y_test, length, count, label):
  """Fit a class-weighted logistic regression per fold and score it with F1.

  Args:
    X_train, y_train: Per-fold training vectors and labels (one list per fold).
    X_test, y_test: Per-fold held-out vectors and labels, aligned with the
      training lists.
    length: Number of posts the folds were built from.
    count: Number of positive (label 1) posts. It should be measured over the
      same `length` posts, otherwise `length - count` can become negative.
    label: Original label code being evaluated; 1.0 and 4.0 are scored with
      micro-averaged F1, every other code with macro-averaged F1.

  Returns:
    A list with one F1 score per fold, each formatted as a "%.3f" string.
  """
  # NOTE(review): the weights are the raw class sizes, so the larger class
  # also gets the larger weight - confirm this is intended.
  w = {0: length - count, 1: count}
  # `multi_class` is omitted: it is deprecated in recent scikit-learn releases
  # and the default already resolves to one-vs-rest for two-class targets.
  lr = LogisticRegression(C=100.0, random_state=1, solver='lbfgs', max_iter=1000000, class_weight=w)

  #'macro' favors minority, 'weighted' favors majority, 'micro' favors none (when you have multiclass)
  average = 'micro' if label in [1.0, 4.0] else 'macro'

  scores = []
  # Walk over the folds that were actually supplied instead of assuming 10.
  for xtrain, ytrain, xtest, ytest in zip(X_train, y_train, X_test, y_test):
    lr.fit(xtrain, ytrain)
    y_predict = lr.predict(xtest)
    # NOTE(review): restricting `labels` to the predicted classes leaves any
    # class the model never predicted out of the average.
    score = f1_score(ytest, y_predict, average=average, labels=np.unique(y_predict))
    scores.append("%.3f" % score)
  return scores

In [None]:
# Read the labeled posts once; each label below derives its own 0/1 view.
posts_raw = pd.read_csv('/content/Labeled Posts - preprocessed_csv.csv')


def _print_fold_scores(scores):
  """Print every per-fold score on its own line, followed by their mean."""
  for score in scores:
    print(score)
  print("Average: " + str(sum(float(score) for score in scores) / len(scores)))


# float 0.0, 1.0, 2.0, 3.0, 4.0, 5.0 - represents: general, question, advice,
# encouragement, experience, bragging posts. Only 1.0-4.0 are evaluated here.
for label in [1.0, 2.0, 3.0, 4.0]:
  # One-vs-rest relabeling: 1 for the current label, 0 for everything else.
  # preprocess() reads the module-level `df`, so the relabeled frame lives there.
  df = posts_raw.assign(labels=(posts_raw["labels"] == label).astype(float))
  for length in range(500, 600, 500):  # currently a single run on the first 500 posts
    # classify() builds its class weights from `length` and `count`, so the
    # positives are counted inside the same slice that preprocess() uses.
    count = int(df["labels"][:length].sum())
    X_train, y_train, X_test, y_test = preprocess(length, "unigram")
    unigram = classify(X_train, y_train, X_test, y_test, length, count, label)
    X_train, y_train, X_test, y_test = preprocess(length, "bigram")
    bigram = classify(X_train, y_train, X_test, y_test, length, count, label)
    X_train, y_train, X_test, y_test = preprocess(length, "binary")
    binary = classify(X_train, y_train, X_test, y_test, length, count, label)

    # Report inside the loop so the header shows the length that was evaluated.
    print(str(length) + " posts: ")
    print("Scores for", label, ": \n")
    _print_fold_scores(unigram)
    print("-----------------------")
    _print_fold_scores(bigram)
    print("-----------------------")
    _print_fold_scores(binary)
    print("\n")

# Leave the original labels in `df` once the evaluation is done.
df = posts_raw

In [None]:
# Sanity check: how many posts carry each label code?
import pandas as pd
df = pd.read_csv('/content/Labeled Posts - preprocessed_csv.csv')
labels = list(df["labels"])
questions, advice, encouragement, experience, bragging = (
    labels.count(code) for code in (1.0, 2.0, 3.0, 4.0, 5.0)
)
print(questions, advice, encouragement, experience, bragging)
# Posts labeled 0.0.
print(labels.count(0.0))
# Class sizes for "experience" versus everything else, out of 2000 posts.
ratio = {0: 2000 - experience, 1: experience}
print(ratio)

1046 103 90 675 25
61
{0: 1325, 1: 675}


A finalized version of the code below is in the "Predicting unlabeled posts" Colab notebook / GitHub link.

In [None]:
df = pd.read_csv('/content/Labeled Posts - preprocessed_csv.csv')
import string
import re

# Punctuation marks that act as separators. '-', "'" and '.' are left out so
# tokens such as "pre-process", "it's" and "7.5" stay in one piece.
punct = re.sub("[-'.]", "", string.punctuation)
# Escape the marks before placing them in a character class; unescaped, the
# sequence "\]" is read as an escaped bracket and the backslash is never matched.
punct_pattern = re.compile(f"[{re.escape(punct)}]")

# d maps every punctuation mark and every word to its occurrence count.
# Punctuation counts start at 0.
d = {}
for mark in punct:
  d[mark] = 0

for post in df['selftext']:
  for word in clean_text(post).split():
    # Count each distinct punctuation mark once per word.
    for char in set(punct_pattern.findall(word)):
      d[char] += 1
    # Only the first non-empty fragment of the word is recorded.
    first_piece = next((piece for piece in punct_pattern.split(word) if piece != ""), None)
    if first_piece is not None:
      # The first sighting counts as 1 (not 0), so values are true frequencies.
      d[first_piece] = d.get(first_piece, 0) + 1

print(len(d)) # 24889 on the recorded run (made before backslash was treated as a separator)

24889


In [None]:
def a(ngram):
  """Build a CountVectorizer restricted to the terms collected in `d`.

  Args:
    ngram: "binary" for 0/1 presence features, "unigram" for word counts;
      any other value selects bigram counts.

  Returns:
    An unfitted CountVectorizer with a fixed vocabulary.
  """
  # CountVectorizer interprets a mapping's values as column indices, but `d`
  # stores occurrence counts. Pass only the terms and let the vectorizer
  # number the columns in insertion order.
  vocabulary = list(d)
  if ngram == "binary":
    return CountVectorizer(vocabulary=vocabulary, binary=True)
  if ngram == "unigram":
    return CountVectorizer(vocabulary=vocabulary)
  # bigram
  # NOTE(review): the keys of `d` are single tokens, so a (2, 2) n-gram range
  # will presumably never match this vocabulary - confirm before relying on it.
  return CountVectorizer(vocabulary=vocabulary, ngram_range=(2, 2))

def b(vectorizer, start=3100, stop=7751):
  """Turn a slice of posts from `df` into a dense bag-of-words matrix.

  Args:
    vectorizer: A CountVectorizer that is already fitted or was built with a
      fixed vocabulary (as returned by `a`).
    start: Position of the first row of `df['selftext']` to use.
    stop: Position one past the last row to use.

  Returns:
    A dense 2-D array with one row per post in the slice.
  """
  # str() keeps non-string cells (e.g. missing values) from breaking
  # clean_text; plain lists also avoid NumPy's fixed-width string storage.
  posts = [clean_text(str(post)) for post in df['selftext'][start:stop].tolist()]
  # transform(), not fit_transform(): the feature columns must stay exactly
  # those the vectorizer already defines, never be re-learned from these posts.
  return vectorizer.transform(posts).toarray()

# Relabel one-vs-rest for label code 2.0: matching rows become 1, all others 0.
i = 2.0
is_target = df["labels"] == i
count = int(is_target.sum())
df["labels"] = is_target.astype(float)

vectorizer = a("unigram")
unigram, lr1 = test(3000, "unigram", count, i)
# bigram, lr2  = test(3000, "bigram", count, i)
# vectorizer = a("binary")
# binary, lr3 = test(3000, "binary", count, i)

# Vectorize the posts selected by b() and predict their labels.
bag = b(vectorizer)
data = np.array(bag)

print(lr1.predict(data))
# print(lr2.predict(data))
# print(lr3.predict(data))