# Working with the gensim TF-IDF model

In [10]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import math

# gensim
import gensim
from gensim.models import TfidfModel
from gensim.corpora import Dictionary

from sklearn.svm import SVC
from sklearn.svm import LinearSVC
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, cross_val_score
from sklearn.decomposition import PCA, KernelPCA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
from sklearn.metrics import accuracy_score

import optuna
import xgboost as xgb

import warnings
# Silence every warning category for the rest of the session. Note that this
# also hides DeprecationWarning / FutureWarning messages, so API changes in the
# libraries used by this notebook will not be reported.
warnings.filterwarnings("ignore")

In [2]:
# Read the train and test tweet datasets from their CSV files.
train_data, test_data = (
    pd.read_csv(csv_path)
    for csv_path in ('Data/TweetC_train.csv', 'Data/TweetC_test.csv')
)

In [23]:
# Drop every row that contains at least one missing value.
#
# `dropna` replaces the manual "collect the NaN index labels, then drop them"
# approach. The previous call passed `axis` positionally
# (`drop(labels, 0, inplace=True)`), which pandas >= 2.0 rejects because every
# argument after `labels` is keyword-only. Rebinding the name instead of
# mutating in place also keeps this cell safe to re-run.
# The surviving rows keep their original index labels (no reset), as before.

# training set
train_data = train_data.dropna(axis=0, how='any')

# testing set
test_data = test_data.dropna(axis=0, how='any')

In [24]:
# sanity check: per-column count of missing values in each split (all zeros expected)
train_data.isna().sum(), test_data.isna().sum()

(Tweet                0
 Sentiment_encoded    0
 dtype: int64, Tweet                0
 Sentiment_encoded    0
 dtype: int64)

In [25]:
# Column 0 holds the tweet text, column 1 the encoded sentiment label.
# Each column is copied out of the frame as an independent NumPy array.
X_train, y_train = (train_data.iloc[:, col].to_numpy(copy=True) for col in (0, 1))
X_test, y_test = (test_data.iloc[:, col].to_numpy(copy=True) for col in (0, 1))

# labels are cast to integers
y_train, y_test = y_train.astype('int'), y_test.astype('int')

In [26]:
# number of tweets in the train / test splits
np.shape(X_train), np.shape(X_test)

((41087,), (3795,))

In [27]:
# number of labels in the train / test splits (should match the tweet counts)
np.shape(y_train), np.shape(y_test)

((41087,), (3795,))

In [35]:
# Collect every training tweet into a plain Python list.
# The previous `range(0, len(X_train)-1)` loop stopped one element short and
# silently dropped the last tweet.
texts = list(X_train)

In [30]:
# Bag-of-words counts with scikit-learn: learn the vocabulary from the training
# tweets and build the document-term matrix in one pass.
# NOTE: `corpus` is rebound to a gensim bag-of-words list in a later cell.
vectorizer = CountVectorizer()
corpus = vectorizer.fit_transform(raw_documents=X_train)
# (number of documents, vocabulary size)
corpus.shape

(41087, 33302)

In [38]:
# Build the gensim bag-of-words corpus for the training tweets.

# Merge all tweets in a single list.
# The previous `range(0, len(X_train)-1)` loop dropped the last tweet, leaving
# the corpus one document shorter than `y_train`.
texts = list(X_train)

# get the tokens of all sentences
tokenlist = [list(gensim.utils.tokenize(text)) for text in texts]
# generate the dictionary (token <-> integer id mapping)
dct = Dictionary(tokenlist)
# generate the corpus: one bag-of-words vector per tweet
corpus = [dct.doc2bow(tokens) for tokens in tokenlist]

# every tweet must have exactly one corpus entry so rows stay aligned with the labels
assert len(corpus) == len(X_train), "corpus and X_train are out of sync"

In [43]:
# Persist the bag-of-words corpus to disk in Matrix Market format.
gensim.corpora.MmCorpus.serialize(fname='MmCorpusX_train.mm', corpus=corpus)

In [44]:
# Fit the TF-IDF model on the bag-of-words corpus.
model = TfidfModel(corpus=corpus)

In [46]:
# confirm which class the fitted model is an instance of
model.__class__

gensim.models.tfidfmodel.TfidfModel

In [57]:
# Apply the fitted TF-IDF model to a single document (index 100 of the corpus).
# The result is a sparse list of (token_id, tf-idf weight) pairs.
vector = model[corpus[100]]
vector

[(22, 0.23691962853679044),
 (36, 0.16736304584915723),
 (63, 0.13725881458476175),
 (74, 0.11683128897030148),
 (79, 0.11064318586811944),
 (96, 0.23447267793894958),
 (518, 0.2757827529686226),
 (527, 0.2233611463320063),
 (553, 0.2526657153078757),
 (581, 0.24205233807916016),
 (607, 0.31252331207397993),
 (822, 0.2689553853641071),
 (839, 0.23572076468202013),
 (881, 0.425938178796318),
 (882, 0.24848568849599065),
 (883, 0.3146960788065133)]

In [58]:
# TF-IDF representation of every document in the corpus, in corpus order.
# Iterate over `corpus` itself instead of `range(0, len(X_train)-1)`: the old
# bound was tied to a different container and would skip the last document
# whenever the corpus holds one entry per tweet.
tfidf_matrix = [model[doc_bow] for doc_bow in corpus]