# **Loading the data**

In [0]:
import numpy as np
import sklearn
import pandas as pd

In [3]:
from google.colab import files
# Opens Colab's file picker; the returned value is not kept.
# NOTE(review): the recorded output shows a re-upload being saved under a new
# name ("data_train (2).pkl"), so the 'data_train.pkl' read by the next cell may
# be an older copy — confirm which file is actually loaded.
files.upload()

Saving data_train.pkl to data_train (2).pkl


KeyboardInterrupt: ignored

In [0]:
# SECURITY: allow_pickle=True lets np.load unpickle arbitrary Python objects,
# which can execute code — only load these files if they come from a trusted source.
# Later cells read train_data[0] as the texts and train_data[1] as the labels.
train_data = np.load('data_train.pkl', allow_pickle=True)
test_data = np.load('data_test.pkl', allow_pickle=True)


# **Data exploration**

In [5]:
# number of samples
print("There are", len(train_data[0]), "samples")

# Classes and per-class sample counts, gathered in one vectorized pass so the
# number of classes is never assumed in advance.
classes, samples_per_class = np.unique(train_data[1], return_counts=True)
print("There are", len(classes), "classes")

# Report the measured counts: a single number when the classes are balanced,
# otherwise the range.
counts = samples_per_class.tolist()
if len(set(counts)) <= 1:
  print("There are", counts[0] if counts else 0, "samples per class")
else:
  print("There are between", min(counts), "and", max(counts), "samples per class")

#median number of words per sample
def num_words_per_sample(sample_texts):
  """Return the median number of whitespace-separated words per text.

  Args:
    sample_texts: iterable of strings.

  Returns:
    The median word count, as computed by ``np.median``.
  """
  word_counts = []
  for text in sample_texts:
    word_counts.append(len(text.split()))
  return np.median(word_counts)

# Compute the median once and reuse it: each call splits every training text.
median_words = num_words_per_sample(train_data[0])
print("The median of words per sample is", int(median_words))

print("The number of samples/number of words per sample ratio is", len(train_data[0]) / median_words)


There are 70000 samples
There are 20 classes
There are 3500 samples per class
The median of words per sample is 25
The number of samples/number of words per sample ratio is 2800.0


# **Data preprocessing**

In [0]:
from sklearn.model_selection import train_test_split

# Fixed seed so the 80/20 hold-out split — and every score computed from it —
# is the same after a kernel restart.
(reddit_train_x, reddit_test_x, reddit_train_y, reddit_test_y) = train_test_split(train_data[0], train_data[1], test_size = 0.2, random_state=42)

In [0]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_classif
from sklearn.naive_bayes import MultinomialNB

# Upper bound on the number of n-gram features kept after selection.
top_k = 35000

def ngram_vectorize(training_comments, training_labels, testing_comments):
  """Build TF-IDF uni/bi-gram features and keep the highest-scoring ones.

  The vectorizer and the feature selector are fitted on the training comments
  only; the testing comments are transformed with the fitted objects.

  Args:
    training_comments: texts used to learn the vocabulary.
    training_labels: labels of the training comments, used to score features.
    testing_comments: texts to vectorize with the learned vocabulary.

  Returns:
    (x_train, x_test): the selected features of both sets, cast to float32.
  """
  tfidf = TfidfVectorizer(
      decode_error='replace',
      strip_accents='unicode',
      stop_words='english',
      ngram_range=(1,2),
      analyzer='word',
      min_df=2,
  )

  # Vocabulary comes from the training texts; the test texts reuse it.
  train_matrix = tfidf.fit_transform(training_comments)
  test_matrix = tfidf.transform(testing_comments)

  # Keep at most `top_k` features, ranked by the ANOVA F-score.
  n_kept = min(top_k, train_matrix.shape[1])
  best_features = SelectKBest(f_classif, k=n_kept)
  best_features.fit(train_matrix, training_labels)

  train_selected = best_features.transform(train_matrix).astype('float32')
  test_selected = best_features.transform(test_matrix).astype('float32')
  return train_selected, test_selected

x_train, x_test = ngram_vectorize(reddit_train_x, reddit_train_y, reddit_test_x)

# **Models**

In [9]:
# Multinomial Naive Bayes on the selected TF-IDF n-gram features
clf = MultinomialNB()
clf.fit(x_train, reddit_train_y)
predict = clf.predict(x_test)
# Hold-out accuracy: share of predictions equal to the true label
np.mean(predict == reddit_test_y)

0.5560714285714285

In [0]:
# SVM
from sklearn.linear_model import SGDClassifier

# Linear SVM (hinge loss, L2 penalty) trained with SGD; tol=None disables the
# convergence check, so training runs for the full max_iter epochs.
svm = SGDClassifier(
    loss="hinge",
    penalty="l2",
    alpha=1e-3,
    random_state=42,
    max_iter=30,
    tol=None,
)
svm.fit(x_train, reddit_train_y)
predict_svm = svm.predict(x_test)
# Hold-out accuracy
np.mean(predict_svm == reddit_test_y)

0.5495

In [0]:
# Logistic regression
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV

# Logistic regression with scikit-learn's default settings on the same features
logistic = LogisticRegression()
logistic.fit(x_train, reddit_train_y)
predict_logistic = logistic.predict(x_test)
# Mean accuracy on the hold-out split
score = logistic.score(x_test, reddit_test_y)
print(score)



0.5385


# **Pre-trained embeddings**

In [0]:
# Install gensim into the notebook runtime (no version is pinned here).
!pip install --quiet gensim

# TODO: add the file at the link below to Drive.
# NOTE(review): presumably the pretrained word2vec archive — confirm before relying on it.
# https://drive.google.com/file/d/0B7XkCwpI5KDYNlNUTTlSS21pQmM/edit

In [11]:
import gensim.downloader as api
# return_path=True makes api.load hand back the dataset's local file path
# instead of a loaded model; the path is printed so it can be checked.
path_word2vec = api.load("word2vec-google-news-300", return_path=True)
print(path_word2vec)

/root/gensim-data/word2vec-google-news-300/word2vec-google-news-300.gz


In [12]:
from gensim.models import KeyedVectors
# Load the pretrained vectors directly from the file (binary word2vec format).
# `model` is the notebook-wide word -> vector lookup used by the embedding cells.
model = KeyedVectors.load_word2vec_format(path_word2vec, binary=True)

  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


In [0]:
import string
import nltk
from nltk.corpus import stopwords

# The stop-word corpus is a separate download; fetch it so this cell also runs
# on a fresh runtime (nothing is re-downloaded when it is already present).
nltk.download('stopwords', quiet=True)
stop_words = set(stopwords.words('english'))

def make_new_data(data_input, stopword_set=None):
  """Lower-case each document, strip punctuation and drop stop words.

  Args:
    data_input: iterable of document strings.
    stopword_set: collection of words to remove. Defaults to the
      notebook-level ``stop_words``.

  Returns:
    A list with one cleaned string per document. Every kept word is followed
    by a single space, so a non-empty result ends with a space and a document
    with no kept word yields ''.
  """
  if stopword_set is None:
    stopword_set = stop_words

  # Build the punctuation-removal table once rather than once per document.
  remove_punctuation = str.maketrans('', '', string.punctuation)

  new_data = []
  for doc in data_input:
    kept = []
    for token in doc.lower().split():
      # Test the token with only its surrounding punctuation removed first, so
      # contractions listed as stop words ("don't", "isn't,") are still
      # recognised before their apostrophe is stripped.
      if token.strip(string.punctuation) in stopword_set:
        continue
      word = token.translate(remove_punctuation)
      # Tokens made only of punctuation become empty and are dropped.
      if word and word not in stopword_set:
        kept.append(word + " ")
    new_data.append(''.join(kept))
  return new_data

# Clean both halves of the hold-out split with the same routine so the
# embedding step sees identically prepared text.
new_train_data = make_new_data(reddit_train_x)
new_test_data = make_new_data(reddit_test_x)


In [18]:

# Access vectors for specific words with a keyed lookup:
vector = model['easy']
# see the shape of the vector (300,)
vector.shape

# There is no sentence-level vector here, so average the word vectors by hand.
# Accumulate into a float array (not a Python list) so `+=` is an explicit
# element-wise add.
vectors = np.zeros(vector.shape[0])
sentence = "This is some text I am processing with Spacy SHSHSHSHS"
count = 0
for word in sentence.split(' '):
  # Membership is tested on the KeyedVectors object itself, which works on
  # gensim 3.x and 4.x (the `.vocab` attribute was removed in 4.0).
  if word in model:
    vectors += model[word]
    count += 1
  else:
    print(word, "not in sentence")
# Only average when at least one word was found, to avoid dividing by zero.
if count:
  vectors /= count

print(vectors.shape)

SHSHSHSHS not in sentence
(300,)


In [19]:
# Sanity check of the loaded vectors: words ranked most similar to "simple".
model.most_similar("simple")

  if np.issubdtype(vec.dtype, np.int):


[('straightforward', 0.7460168600082397),
 ('Simple', 0.7108173966407776),
 ('uncomplicated', 0.6297484636306763),
 ('simplest', 0.6171397566795349),
 ('easy', 0.5990299582481384),
 ('fairly_straightforward', 0.5893307328224182),
 ('deceptively_simple', 0.5743066072463989),
 ('simpler', 0.5537199378013611),
 ('simplistic', 0.5516539216041565),
 ('disarmingly_simple', 0.5365327000617981)]

In [0]:
def embeddings(data, keyed_vectors=None, dim=None):
  """Represent each document by the mean of its known word vectors.

  Args:
    data: sequence of document strings; words are split on whitespace.
    keyed_vectors: word -> vector lookup supporting ``word in kv`` and
      ``kv[word]``. Defaults to the notebook-level ``model``.
    dim: vector dimensionality. Defaults to ``keyed_vectors.vector_size`` when
      that attribute exists, otherwise 300.

  Returns:
    Array of shape (len(data), dim). Rows of documents with no known word
    stay all-zero.
  """
  if keyed_vectors is None:
    keyed_vectors = model
  if dim is None:
    dim = getattr(keyed_vectors, "vector_size", 300)

  total = len(data)
  embed = np.zeros((total, dim))
  # one document (sentence) at a time
  for index, doc in enumerate(data):
    count = 0
    # sum the vector of every word of the document that the lookup knows;
    # membership is tested on the lookup itself (gensim 4 removed `.vocab`)
    for word in doc.split():
      if word in keyed_vectors:
        embed[index] += keyed_vectors[word]
        count += 1
    # turn the sum into the document's mean embedding; skip when no word was
    # found so the row stays zero instead of dividing by zero
    if count:
      embed[index] /= count
    # report every document, counting from 1 so the last line reads total/total
    print('\r Progress %s/%s' % (index + 1, total), end="")
  return embed

In [21]:
# Mean word-vector representation of every cleaned training document.
train_word_embed = embeddings(new_train_data)
# Leading newline moves past the '\r' progress line before printing the shape.
print("\n", train_word_embed.shape)

 Progress 55999/56000
 (56000, 300)


In [22]:
# Inspect the embedding of the first training document.
train_word_embed[0]

array([ 0.06328125,  0.12769775,  0.02001038,  0.11289673, -0.02246075,
        0.10825195,  0.11875   , -0.07305908,  0.04571533,  0.09179688,
        0.00377197, -0.16743164, -0.05092773,  0.01308594, -0.08240356,
        0.00649414, -0.12415771,  0.17851563,  0.02233887, -0.07890625,
       -0.01975098,  0.00690918,  0.06424561, -0.06119385,  0.02331543,
       -0.1355957 , -0.13981934, -0.0055542 ,  0.10153809, -0.08789062,
        0.06328735,  0.13879395, -0.00776367, -0.00031738, -0.11497803,
       -0.04671631,  0.21777344,  0.09018555, -0.00800781,  0.10351562,
        0.08557129, -0.08491211,  0.2140625 , -0.025     ,  0.12358398,
       -0.11191406, -0.0357605 , -0.00273437,  0.11337891,  0.00368805,
       -0.09755859, -0.00600586, -0.07563477, -0.03389893, -0.05596313,
        0.05493164,  0.06135254,  0.01606445,  0.01801147,  0.00406494,
       -0.12299805, -0.05092773,  0.05938721, -0.04364319, -0.09420166,
       -0.07133789,  0.05507812, -0.02626953, -0.08476562,  0.17

In [23]:
# Same embedding step for the hold-out documents; the count is printed first
# so it can be compared with the progress line.
print(len(new_test_data))
test_word_embed = embeddings(new_test_data)

14000
 Progress 13999/14000

In [24]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.neural_network import MLPClassifier

# Two hidden layers of 100 units on the averaged word-vector features.
# random_state fixes the weight initialisation and batch sampling so the
# reported score is reproducible, matching the seeded SGD classifiers.
clf = MLPClassifier((100, 100,), random_state=42)

clf.fit(train_word_embed, reddit_train_y)

predict = clf.predict(test_word_embed)
# Mean accuracy on the hold-out split
score = clf.score(test_word_embed, reddit_test_y)
print(score)



0.3886428571428571


In [27]:
from sklearn.linear_model import SGDClassifier

# Same SGD-trained linear SVM configuration as before, now fitted on the
# averaged word-vector features.
svm_embeddings = SGDClassifier(
    loss="hinge",
    penalty="l2",
    alpha=1e-3,
    random_state=42,
    max_iter=30,
    tol=None,
)
svm_embeddings.fit(train_word_embed, reddit_train_y)
predict_svm_embeddings = svm_embeddings.predict(test_word_embed)
# Mean accuracy on the hold-out split
score_svm_embeddings = svm_embeddings.score(test_word_embed, reddit_test_y)
print(score_svm_embeddings)

0.3492142857142857
