## Imports

In [1]:
from unpickle_all_data import *
import nltk 
import sklearn as sk
import numpy as np
import pickle
import re
from nltk.corpus import stopwords
nltk.download('wordnet')
nltk.download('omw-1.4')

Comments data: dict_keys(['X_comments', 'X_comments_sub', 'Y_comments', 'Y_comments_sub'])
Posts data:    dict_keys(['X_posts', 'X_posts_sub', 'Y_posts', 'Y_posts_sub'])


[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\LAPTOBEY\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\LAPTOBEY\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

## Getting Pickled Data

In [2]:
# Pull the feature (X_*) and label (Y_*) collections out of the two
# dictionaries (presumably supplied by the `unpickle_all_data` star import;
# they are not defined in this notebook).
X_comments, Y_comments = comments_data['X_comments'], comments_data['Y_comments']
X_comments_sub, Y_comments_sub = comments_data['X_comments_sub'], comments_data['Y_comments_sub']

X_posts, Y_posts = posts_data['X_posts'], posts_data['Y_posts']
X_posts_sub, Y_posts_sub = posts_data['X_posts_sub'], posts_data['Y_posts_sub']


## Shuffling Data

In [3]:

def groupData(arr1,arr2):
  """Pair the items of two indexable collections position by position.

  Walks the indices ``0 .. len(arr1) - 1`` and builds a two-element list
  ``[arr1[i], arr2[i]]`` for each one. ``arr2`` is read at the same indices,
  so it has to provide at least as many items as ``arr1``; any extra items
  in ``arr2`` are ignored.

  Returns:
    list: one ``[item_from_arr1, item_from_arr2]`` list per index.
  """
  paired = []
  for position in range(len(arr1)):
    paired.append([arr1[position], arr2[position]])
  return paired

def ungroupData(arr):
  """Split a collection of pairs into two parallel lists.

  ``arr`` is traversed twice: the first pass collects element ``[0]`` of
  every entry, the second pass collects element ``[1]``.

  Returns:
    tuple: ``(first_items, second_items)``, both plain lists in the order
    the entries appear in ``arr``.
  """
  first_items = list(map(lambda pair: pair[0], arr))
  second_items = list(map(lambda pair: pair[1], arr))
  return first_items, second_items

def shuffleData(arr, random_state=None):
  """Return a shuffled copy of ``arr`` via ``sklearn.utils.shuffle``.

  Args:
    arr: collection to shuffle; passed to ``sklearn.utils.shuffle`` as is.
    random_state: optional seed (or random-state object) forwarded to
      ``sklearn.utils.shuffle``. Pass a fixed integer to get the same order
      on every run of the notebook. The default ``None`` keeps the previous
      behaviour of an unseeded shuffle.

  Returns:
    Whatever ``sklearn.utils.shuffle`` returns for ``arr``.
  """
  return sk.utils.shuffle(arr, random_state=random_state)

# Shuffle every dataset as [x, y] pairs so that each sample stays attached to
# its own label while the order changes.
comments = shuffleData(groupData(X_comments,Y_comments))
comments_sub = shuffleData(groupData(X_comments_sub,Y_comments_sub))
posts = shuffleData(groupData(X_posts,Y_posts))
posts_sub = shuffleData(groupData(X_posts_sub,Y_posts_sub))

# Split the shuffled pairs back into parallel X / Y lists, replacing the
# unshuffled versions loaded earlier.
X_comments,Y_comments = ungroupData(comments)
X_comments_sub,Y_comments_sub = ungroupData(comments_sub)
X_posts,Y_posts = ungroupData(posts)
X_posts_sub,Y_posts_sub = ungroupData(posts_sub)


## Data Preprocessing Helper Functions

In [4]:
def stemming_data(data):
    """Clean and lemmatise a collection of raw text documents.

    Every item is converted with ``str()`` and then processed as follows:

    1. each character outside ``a-z``, ``A-Z``, ``0-9`` becomes a space;
    2. single letters standing between whitespace are removed;
    3. a single letter at the very start of the text is removed;
    4. runs of whitespace are collapsed to one space;
    5. a leading ``b`` token is removed (presumably the prefix that ``str()``
       leaves on a bytes object -- confirm against the data source);
    6. the text is lower-cased;
    7. each whitespace-separated token is passed through
       ``WordNetLemmatizer.lemmatize`` and the tokens are re-joined with
       single spaces.

    Args:
        data: iterable of documents (anything ``str()`` accepts).

    Returns:
        list: one cleaned string per input item, in input order.
    """
    lemmatizer = nltk.stem.WordNetLemmatizer()
    documents = []
    for raw_document in data:
        # Remove all the special characters
        document = re.sub(r'[^a-zA-Z0-9]', ' ', str(raw_document))
        # Remove all single letters that stand between whitespace. The
        # trailing whitespace is only inspected (lookahead), not consumed, so
        # it can still serve as the leading whitespace of the next single
        # letter; consuming it would leave every other letter of a run such
        # as "x a b c y" behind.
        document = re.sub(r'\s+[a-zA-Z](?=\s)', '', document)
        # Remove a single letter from the start of the text. The caret has to
        # be an unescaped anchor: r'\^' would search for a literal '^', which
        # cannot occur after the special-character substitution above.
        document = re.sub(r'^[a-zA-Z]\s+', ' ', document)
        # Substituting multiple spaces with single space
        document = re.sub(r'\s+', ' ', document)
        # Removing prefixed 'b'
        document = re.sub(r'^b\s+', '', document)
        # Converting to Lowercase
        document = document.lower()
        # Lemmatization (split() also discards leading/trailing whitespace)
        tokens = [lemmatizer.lemmatize(word) for word in document.split()]
        documents.append(' '.join(tokens))
    return documents

def createVectorizer(training):
  """Create a TF-IDF vectorizer and fit it on the training documents.

  The vectorizer is configured with at most 1500 features, ignores terms
  that appear in fewer than 5 documents or in more than 70% of documents,
  and uses NLTK's English stop-word list.

  Args:
    training: iterable of text documents used to fit the vectorizer.

  Returns:
    The fitted ``TfidfVectorizer``.
  """
  vectorizer = sk.feature_extraction.text.TfidfVectorizer(input='content',max_features=1500, min_df=5, max_df=0.7, stop_words=stopwords.words('english'))
  # Only the fitted state is needed here, so call fit() instead of
  # fit_transform(...).toarray(): that would materialise a dense
  # documents x features matrix only to throw it away.
  vectorizer.fit(training)
  return vectorizer

def vectorizeData(vectorizer,data):
    """Transform ``data`` with a fitted vectorizer and return it as an array.

    Calls ``vectorizer.transform(data)`` and converts the result with its
    ``toarray()`` method.
    """
    transformed = vectorizer.transform(data)
    return transformed.toarray()

  

## Preprocessing Data

In [5]:
# For each dataset: clean/lemmatise the text, fit a dedicated TF-IDF
# vectorizer on it, then turn the same documents into a weighted matrix.

# Comments
documents_X_comments = stemming_data(X_comments)
vectorizer_comments = createVectorizer(documents_X_comments)
weighted_X_comments = vectorizeData(vectorizer_comments,documents_X_comments)

# Comments (sub)
documents_X_comments_sub = stemming_data(X_comments_sub)
vectorizer_comments_sub = createVectorizer(documents_X_comments_sub)
weighted_X_comments_sub = vectorizeData(vectorizer_comments_sub,documents_X_comments_sub)

# Posts
documents_X_posts = stemming_data(X_posts)
vectorizer_posts = createVectorizer(documents_X_posts)
weighted_X_posts = vectorizeData(vectorizer_posts,documents_X_posts)

# Posts (sub)
documents_X_posts_sub = stemming_data(X_posts_sub)
vectorizer_posts_sub = createVectorizer(documents_X_posts_sub)
weighted_X_posts_sub = vectorizeData(vectorizer_posts_sub,documents_X_posts_sub)


## Pickling Data

In [7]:
def pickleData(arr, filename):
  """Serialise ``arr`` to ``filename`` with pickle.

  Missing parent directories of ``filename`` are created first, so the call
  also works when the output folder does not exist yet.

  Args:
    arr: any picklable object.
    filename: destination path; an existing file is overwritten.
  """
  # Local import keeps this helper self-contained; the notebook's import
  # cell does not import `os` explicitly.
  import os

  parent_dir = os.path.dirname(filename)
  if parent_dir:  # empty when the file goes into the current directory
    os.makedirs(parent_dir, exist_ok=True)
  with open(filename,'wb') as f:
    pickle.dump(arr, f)

def unPickleData(filename):
  """Load and return the object stored in the pickle file ``filename``."""
  with open(filename,'rb') as handle:
    return pickle.load(handle)

# Write every weighted feature matrix and its label list to
# processed_data/<comments|posts>/<name>.pkl.
for payload, target_path in (
    (weighted_X_comments, os.path.join("processed_data", "comments","X_comments.pkl")),
    (Y_comments, os.path.join("processed_data", "comments","Y_comments.pkl")),
    (weighted_X_comments_sub, os.path.join("processed_data", "comments","X_comments_sub.pkl")),
    (Y_comments_sub, os.path.join("processed_data", "comments","Y_comments_sub.pkl")),
    (weighted_X_posts, os.path.join("processed_data", "posts","X_posts.pkl")),
    (Y_posts, os.path.join("processed_data", "posts","Y_posts.pkl")),
    (weighted_X_posts_sub, os.path.join("processed_data", "posts","X_posts_sub.pkl")),
    (Y_posts_sub, os.path.join("processed_data", "posts","Y_posts_sub.pkl")),
):
  pickleData(payload, target_path)



### Data Sanity Checking

In [None]:
# Round-trip check: reload two of the pickled files and compare each with the
# in-memory object it was written from (prints True when they match).
comments_check_path = os.path.join("processed_data", "comments","X_comments.pkl")
arr = unPickleData(comments_check_path)
print(np.array_equal(arr,weighted_X_comments))

posts_check_path = os.path.join("processed_data", "posts","Y_posts.pkl")
arr = unPickleData(posts_check_path)
print(np.array_equal(arr,Y_posts))


True
True
