In [19]:
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
import re
from tqdm import tqdm
from time import time

# clean
skip it if you have cleaned data

In [2]:
raw_data = pd.read_csv('8k_labeled.csv')

In [3]:
text = raw_data['text']

In [4]:
processed_features = []

for sentence in tqdm(range(0, len(text))):
    # Remove all the special characters
    processed_feature = re.sub(r'\W', ' ', str(text[sentence]))
    # Converting to Lowercase
    processed_feature = processed_feature.lower()
    processed_features.append(processed_feature)

100%|█████████████████████████████████████████████████████████████████████████████| 1213/1213 [00:03<00:00, 308.97it/s]


In [5]:
df_text = pd.DataFrame({'text': processed_features})
df_text.to_csv('processed_text.csv')

# vectorize

In [4]:
text = pd.read_csv('processed_text.csv') # load cleaned data
text['text']

0        0001193125 16 516891 txt   20160324  00011931...
1        0001193125 13 267629 txt   20130621  00011931...
2        0001628280 16 017762 txt   20160726  00016282...
3        0001193125 16 473562 txt   20160223  00011931...
4        0001193125 17 262261 txt   20170818  00011931...
                              ...                        
1208     0001193125 13 104353 txt   20130313  00011931...
1209     0001193125 20 222423 txt   20200817  00011931...
1210     0001193125 13 365311 txt   20130912  00011931...
1211     0001403161 17 000026 txt   20170420  00014031...
1212     0001403161 18 000047 txt   20181017  00014031...
Name: text, Length: 1213, dtype: object

In [5]:
LM_dict = pd.read_csv('LM_dict.csv',keep_default_na=False)

In [6]:
LM_dict.iloc[1]['Word']

'AARDVARKS'

In [7]:
vocab = dict()
for index, row in LM_dict.iterrows():
    word = row['Word'].lower()
    vocab[word] = index

In [10]:
vocab

{'aardvark': 0,
 'aardvarks': 1,
 'abaci': 2,
 'aback': 3,
 'abacus': 4,
 'abacuses': 5,
 'abaft': 6,
 'abalone': 7,
 'abalones': 8,
 'abandon': 9,
 'abandoned': 10,
 'abandoning': 11,
 'abandonment': 12,
 'abandonments': 13,
 'abandons': 14,
 'abase': 15,
 'abased': 16,
 'abasement': 17,
 'abasements': 18,
 'abases': 19,
 'abash': 20,
 'abashed': 21,
 'abashedly': 22,
 'abashes': 23,
 'abashing': 24,
 'abashment': 25,
 'abashments': 26,
 'abasing': 27,
 'abate': 28,
 'abated': 29,
 'abatement': 30,
 'abatements': 31,
 'abates': 32,
 'abating': 33,
 'abattoir': 34,
 'abattoirs': 35,
 'abbe': 36,
 'abbes': 37,
 'abbess': 38,
 'abbesses': 39,
 'abbey': 40,
 'abbeys': 41,
 'abbot': 42,
 'abbots': 43,
 'abbreviate': 44,
 'abbreviated': 45,
 'abbreviates': 46,
 'abbreviating': 47,
 'abbreviation': 48,
 'abbreviations': 49,
 'abdicate': 50,
 'abdicated': 51,
 'abdicates': 52,
 'abdicating': 53,
 'abdication': 54,
 'abdications': 55,
 'abdomen': 56,
 'abdomens': 57,
 'abdominal': 58,
 'abdomi

In [8]:

pipe = Pipeline([('count', CountVectorizer(vocabulary=vocab)),
                 ('tfid', TfidfTransformer())]).fit(text['text'])
# X = pipe['count'].transform(corpus)
# vectorizer.get_feature_names_out()
# print(X.shape)
X = pipe.transform(text['text'])

In [9]:
X.shape

(1213, 86531)

In [10]:
sort_index = np.argsort(X.toarray().sum(axis = 0))# ascending

In [11]:
new_vocab = pipe['count'].get_feature_names_out()[sort_index[-500:]]#take top 500 vocab

# training with new vocab list

In [22]:
vectorizer = TfidfVectorizer()
y = pd.read_csv('8k_labeled.csv')['label']
X_train, X_test, y_train, y_test = train_test_split(text['text'], y, test_size=0.25, random_state=42)

In [23]:
X_train = vectorizer.fit_transform(X_train)
X_test = vectorizer.transform(X_test)

In [24]:
from sklearn.linear_model import LogisticRegression
clf = LogisticRegression(random_state=1).fit(X_train, y_train)

In [25]:
clf.score(X_test, y_test)

0.4243421052631579

In [28]:
clf.score(X_train, y_train)

0.7656765676567657

# Filter in the 500 vocab from text

In [12]:
text = text['text']

In [15]:
vocab_set = set(new_vocab)

In [24]:
temp = []
start = time()
for string in tqdm(text):
    split = string.split()
    filtered = [i for i in split if i in vocab_set]
print(time()-start)

100%|█████████████████████████████████████████████████████████████████████████████| 1213/1213 [00:01<00:00, 865.35it/s]

1.404750108718872



