In [None]:
#Name: JANMEJAY MOHANTY
#Cite: https://necromuralist.github.io/Neurotic-Networking/posts/nlp/implementing-twitter-logistic-regression/

In [31]:
# Importing the libraries
import pandas as pd
import numpy as np
import math

In [32]:
# Reading all given files
tweet_test_data = pd.read_csv('swad_test.csv')
tweet_train_data = pd.read_csv('swad_train.csv')

with open('punctuations.txt', 'r', encoding ='utf-8') as f:
    punctuations_data = [line.strip() for line in f]                       # Storing text file in list format

with open('stopwords.txt', 'r', encoding ='utf-8') as f:
    stopwords_data = [line.strip() for line in f]                          # Storing text file in list format                          

In [33]:
# Converting Label values "Yes" to '1' and "No" to '0'
tweet_train_data['Label_new'] = np.where(tweet_train_data['Label'] == 'Yes', 1, 0)
del tweet_train_data['Label']
tweet_test_data['Label_new'] = np.where(tweet_test_data['Label'] == 'Yes', 1, 0)
del tweet_test_data['Label']

In [34]:
# Converting all tweets into lowercases 
tweet_test_data['Tweet'] = tweet_test_data['Tweet'].str.lower()
tweet_train_data['Tweet'] = tweet_train_data['Tweet'].str.lower()

In [35]:
# Define the function for spacing the punctuation
def spacing_punctuations(text):
    for p in punctuations_data:
        text = text.replace(p,' '+p+' ')
    return text   

In [36]:
# Applying punctuation spacing.
tweet_test_data['Tweet'] = tweet_test_data['Tweet'].apply(spacing_punctuations)
tweet_train_data['Tweet'] = tweet_train_data['Tweet'].apply(spacing_punctuations)

In [37]:
# Removing the '@user'
tweet_test_data['Tweet'] = tweet_test_data['Tweet'].apply(lambda x: x.replace('@ user',''))
tweet_train_data['Tweet'] = tweet_train_data['Tweet'].apply(lambda x: x.replace('@ user',''))

In [38]:
# Removing stopwords from both train and test CSV files
tweet_test_data['Tweet'] = tweet_test_data['Tweet'].apply(lambda x: ' '.join([word for word in x.split() if word not in (stopwords_data)]))
tweet_train_data['Tweet'] = tweet_train_data['Tweet'].apply(lambda x: ' '.join([word for word in x.split() if word not in (stopwords_data)]))

print(tweet_train_data)

        ID                                              Tweet  Label_new
0        0                    bullshit radar full force today          0
1        1             < user > sorry, cut bull shit please .          1
2        2                gay heâ€™s sick pervert gets nerves          1
3        3  great . hereâ€™s today gone : floris wanted sp...          0
4        4  bitch iâ€™m hood serving junkies maybach ! ! !...          0
...    ...                                                ...        ...
3584  3584  2 / 3 tv claiming journalist kagan sitting & a...          0
3585  3585  stfu . ' survive 13 seasons hell hospital shit...          0
3586  3586  hell location time shooting got anything ? peo...          0
3587  3587  isnâ€™t considered hate speech liberals . repu...          0
3588  3588  itâ€™d better monday hell slow walk dictate te...          0

[3589 rows x 3 columns]


In [39]:
#td = tweet_train_data[tweet_train_data['Label_new'] == 1].reset_index(drop=True)
td = tweet_train_data
converted_list=[]
for element in td['Tweet']:
    converted_list.append(element.strip())
string = ' '.join([str(item) for item in converted_list])

word_set =  string.split(' ')
uniquewords = set(word_set)
#print(uniquewords)

In [40]:
now = dict.fromkeys(uniquewords, 0)                            #now here represents the number of words 
for x in word_set:
    now[x] += 1
#print(now)    

In [41]:
# Created the TF function
def TF(NoOfWords, SetofWords):
    tfDict = {}
    SetofWordsCount = len(SetofWords)
    for word, count in NoOfWords.items():
        tfDict[word] = count / float(SetofWordsCount)
    return tfDict    

In [42]:
# Calculating the TF Values
tf = TF(now, word_set)

# Displaying TF Values
tf

{'concerned': 4.0817992571125354e-05,
 'arrogantly': 4.0817992571125354e-05,
 'win': 0.0001836809665700641,
 'lmfaoo': 4.0817992571125354e-05,
 'rewards': 2.0408996285562677e-05,
 'defending': 4.0817992571125354e-05,
 '44': 2.0408996285562677e-05,
 'indicators': 4.0817992571125354e-05,
 'vampire': 2.0408996285562677e-05,
 'right,': 2.0408996285562677e-05,
 'rocks': 6.122698885668803e-05,
 'tweeted': 2.0408996285562677e-05,
 'sin': 2.0408996285562677e-05,
 'bearded': 4.0817992571125354e-05,
 'disrespect': 4.0817992571125354e-05,
 'portion': 4.0817992571125354e-05,
 'thatâ€™s': 0.0013674027511326992,
 'chop': 6.122698885668803e-05,
 'weapons': 4.0817992571125354e-05,
 'i,': 2.0408996285562677e-05,
 'magic': 6.122698885668803e-05,
 'spam': 0.00012245397771337607,
 'lot,': 4.0817992571125354e-05,
 'hr': 2.0408996285562677e-05,
 'ðÿ’\x81ðÿ\x8f½ðÿ˜‚': 4.0817992571125354e-05,
 'credibility': 4.0817992571125354e-05,
 'tonne': 4.0817992571125354e-05,
 'car': 0.00020408996285562676,
 'fed': 2.04

In [43]:
# Created the IDF function
def IDF(nwords, unique_words):
  idf_dict={}
  N=len(nwords)
  for i in unique_words:
    count=0
    for nw in nwords:
      if i in nw.split():
        count=count+1
      idf_dict[i]=(math.log((1+N)/(count+1)))+1
  return idf_dict       

In [44]:
# Calculating the IDF Values
idfs = IDF(now, uniquewords)

In [45]:
# Created the TF-IDF function
def TFIDF(tfn, idf):
    tfidf = {}
    for word, val in tfn.items():
        tfidf[word] = val* idf[word]
    return tfidf    

In [46]:
# Calculating the TF-IDF Values
tfidfs = TFIDF(tf, idfs)

In [47]:
# Displaying TF-IDF Values
tfidfs

{'concerned': 0.00037132625539486146,
 'arrogantly': 0.00037132625539486146,
 'win': 0.0016709681492768766,
 'lmfaoo': 0.00037132625539486146,
 'rewards': 0.00018566312769743073,
 'defending': 0.00037132625539486146,
 '44': 0.00018566312769743073,
 'indicators': 0.00037132625539486146,
 'vampire': 0.00018566312769743073,
 'right,': 0.00018566312769743073,
 'rocks': 0.0005569893830922922,
 'tweeted': 0.00018566312769743073,
 'sin': 0.00018566312769743073,
 'bearded': 0.00037132625539486146,
 'disrespect': 0.00037132625539486146,
 'portion': 0.00037132625539486146,
 'thatâ€™s': 0.012439429555727858,
 'chop': 0.0005569893830922922,
 'weapons': 0.00037132625539486146,
 'i,': 0.00018566312769743073,
 'magic': 0.0005569893830922922,
 'spam': 0.0011139787661845844,
 'lot,': 0.00037132625539486146,
 'hr': 0.00018566312769743073,
 'ðÿ’\x81ðÿ\x8f½ðÿ˜‚': 0.00037132625539486146,
 'credibility': 0.00037132625539486146,
 'tonne': 0.00037132625539486146,
 'car': 0.0018566312769743073,
 'fed': 0.00018

In [48]:
# Creating dataframe of TFIDF
tfidf_df = pd.DataFrame(tfidfs.items(), columns= ['Words', 'TF-IDF'])
tfidf_df

Unnamed: 0,Words,TF-IDF
0,concerned,0.000371
1,arrogantly,0.000371
2,win,0.001671
3,lmfaoo,0.000371
4,rewards,0.000186
...,...,...
6564,commies,0.000743
6565,ðÿ‘‘ðÿ‘‘ðÿ‘‘ðÿ‘‘,0.000743
6566,stab,0.000743
6567,opinions,0.001485


In [49]:
X_train, y_train = tweet_test_data['Label_new'], tfidf_df['TF-IDF']
X_test = tweet_test_data['Tweet']

y_train.info()

<class 'pandas.core.series.Series'>
RangeIndex: 6569 entries, 0 to 6568
Series name: TF-IDF
Non-Null Count  Dtype  
--------------  -----  
6569 non-null   float64
dtypes: float64(1)
memory usage: 51.4 KB


In [50]:
class LogisticRegression:
    def __init__(self, learning_rate=0.001, n_iters=1000):
        self.lr = learning_rate
        self.n_iters = n_iters
        self.weights = None
        self.bias = None

    def fit(self, X, y):
        n_samples, n_features = X.shape
        self.weights = np.zeros(n_features)
        self.bias = 0

        # Algorithm of Gradient Descent
        for _ in range(self.n_iters):
            linear_model = np.dot(X, self.weights) + self.bias
            y_predicted = self._sigmoid(linear_model)
            W = (1 / n_samples) * np.dot(X.T, (y_predicted - y))
            B = (1 / n_samples) * np.sum(y_predicted - y)
            self.weights -= self.lr * W
            self.bias -= self.lr * B

    def predict(self, X):
        linear_model = np.dot(X, self.weights) + self.bias
        y_predicted = self._sigmoid(linear_model)
        y_predicted_cls = [1 if i > 0.5 else 0 for i in y_predicted]
        return np.array(y_predicted_cls)

    def _sigmoid(self, x):
        return 1 / (1 + np.exp(-x))

    # Calculating the Square_loss
    def square_loss(y_pred, target):
        return np.mean(np.power((y_pred - target),2))    

In [51]:
def accuracy(y_true, y_pred):
        accuracy = np.sum(y_true == y_pred) / len(y_true)
        return accuracy

In [52]:
regressor = LogisticRegression(learning_rate=0.0001, n_iters=1000)
regressor.fit(X_train, y_train)

ValueError: not enough values to unpack (expected 2, got 1)