In [133]:
'''
Findings:
    Methods:
    1. BOG unigram
    2. BOG bigram
    3. TF-IDF
    
    Overall, these methods create too many features
    
    Possible solution: Dimensionality reduction
    1. BOG unigram using LatentDirichletAllocation
    2. BOG unigram using TruncatedSVD
    
    TODO: 
    1. Different dimensionality reduction methods
    2. Cannot drop stop words since they are hashed. But we can drop K common words.
    3. Try bigram or TF-IDF with dimensionality reduction.
    4. Try feature selection rather than dimensionality reduction. (option: SGDClassifier)
    5. Any parameter tuning.

TODO: 
    - word2vec? We would have to train new weights based on corpus
    
'''

'\nFindings:\n    Methods:\n    1. BOG unigram\n    2. BOG bigram\n    3. TF-IDF\n    \n    Overall, these methods create many features\n    \n    Possible solution: Dimentionality reduction\n    1. BOG unigram using LatentDirichletAllocation\n    \n\nTODO: \n    - word2vec? We would have to train new weights based on corpus\n    \n\n\n\n'

In [134]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation, TruncatedSVD


In [135]:
# Read the train and test CSV files into DataFrames.
PATH = '../data/'
FILE_TRAIN = 'XYtr.csv'
FILE_TEST = 'Xte.csv'
df_train = pd.read_csv(f'{PATH}{FILE_TRAIN}')
df_test = pd.read_csv(f'{PATH}{FILE_TEST}')

# Pull out the 'description' column of each frame, substituting the
# placeholder token 'NAN' wherever a description is missing.
description_train = df_train['description'].fillna('NAN')
description_test = df_test['description'].fillna('NAN')

In [136]:
# Unigram bag-of-words.
# The vocabulary is learned from the train and test descriptions together,
# so both sets are encoded against the same columns.
vectorizer = CountVectorizer()
corpus = vectorizer.fit_transform([*description_train, *description_test])
features = vectorizer.get_feature_names_out()

# Encode the training descriptions: one row per description, one column per
# vocabulary term. NOTE: .toarray() densifies the sparse count matrix.
sequences = list(description_train)
sequences_train = vectorizer.transform(sequences).toarray()
word_features_train = pd.DataFrame(data=sequences_train, columns=features)
print(word_features_train.shape)

# Encode the test descriptions against the same vocabulary.
sequences = list(description_test)
sequences_test = vectorizer.transform(sequences).toarray()
word_features_test = pd.DataFrame(data=sequences_test, columns=features)
print(word_features_test.shape)


(6914, 14381)
(6914, 14381)


In [137]:
# Bigram bag-of-words. ngram_range=(2, 2) keeps ONLY two-token sequences,
# which yields far more features than the unigram vocabulary.
vectorizer = CountVectorizer(ngram_range=(2, 2))
corpus = list(description_train) + list(description_test)
# Only the learned vocabulary is needed here, so fit without keeping the matrix.
vectorizer.fit(corpus)
features = vectorizer.get_feature_names_out()
print(features)

# Encode the training descriptions against the bigram vocabulary.
# NOTE: .toarray() densifies the sparse count matrix.
sequences = list(description_train)
sequences_train = vectorizer.transform(sequences).toarray()
word_features_train = pd.DataFrame(data=sequences_train, columns=features)
print(word_features_train.shape)

['002n7 5o5hz' '002n7 gq5ct' '002n7 rkasv' ... 'zzw3j lbjbk' 'zzw3j zbbrf'
 'zzymt cdzuk']
(6914, 78508)


In [138]:
# TF-IDF
# Term Frequency: This summarizes how often a given word appears within a document.
# Inverse Document Frequency: This downscales words that appear a lot across documents.
# The vocabulary and IDF weights are fitted on the train and test descriptions together.
corpus = list(description_train)+list(description_test)
vectorizer = TfidfVectorizer()
vectorizer.fit(corpus)
features = vectorizer.get_feature_names_out()

# Encode the training descriptions as a dense TF-IDF matrix, one column per term.
sequences = list(description_train)
sequences_train = vectorizer.transform(sequences).toarray()
word_features_train = pd.DataFrame(sequences_train, columns=features)
# Print the shape of the frame built in this cell. The name `vector` is not
# assigned in any cell shown in this notebook, so printing `vector.shape`
# raises NameError on a fresh kernel.
print(word_features_train.shape)

(6914, 14381)


In [139]:
# LDA from lecture (dimensionality reduction)
# Reduce the unigram count matrix to K topic proportions per document.
K = 10
corpus = list(description_train)+list(description_test)
vectorizer = CountVectorizer()
corpus = vectorizer.fit_transform(corpus)
# LDA is fitted with a stochastic procedure; fix the seed so that
# Restart & Run All reproduces the same topics (same seed as the
# TruncatedSVD cell).
lda = LatentDirichletAllocation(n_components=K, random_state=42)
lda.fit(corpus)
# One row per document (train followed by test), one column per topic.
topics = lda.transform(corpus)

print(topics.shape)

(13828, 10)


In [142]:
# TruncatedSVD (dimensionality reduction)
# Unlike PCA, this estimator does not center the data before computing the
# singular value decomposition.
K = 10

# Unigram counts over the combined train + test descriptions.
vectorizer = CountVectorizer()
corpus = list(description_train) + list(description_test)
corpus = vectorizer.fit_transform(corpus)

# Fit a K-component decomposition with a fixed seed.
svd = TruncatedSVD(
    n_components=K,
    n_iter=7,
    random_state=42,
)
svd.fit(corpus)

# Sum of the per-component explained variance ratios.
print(svd.explained_variance_ratio_.sum())

0.5762409908464238


In [141]:
'''
Rough work
'''
# NOTE(review): everything below is disabled scratch code. It reads a name
# `description` that is not defined in the cells shown here, so it would need
# adapting (e.g. to description_train) before being re-enabled.

# # Create the corpus and bag of words.
# corpus = description.values.tolist()
# vectorizer = CountVectorizer()
# vectorizer.fit(corpus)
# # Each word is mapped to an index in the count vector (see vectorizer.vocabulary_).
# # print(vectorizer.vocabulary_)

# # Create the bag of words.
# bag_of_words = vectorizer.transform(corpus)
# # (sequence number, index assigned to word from fit) -> count
# # print(bag_of_words)

# # We can see that for sequence 0, the index 627 (representing word 627) has count 1
# # print('bag of words as an array:\n{}'.format(bag_of_words.toarray()[0][627]))


'\nRough work\n'