# Importing data

In [69]:
#general purpose packages
import numpy as np
import pandas as pd
import tensorflow as tf
import matplotlib.pyplot as plt
import seaborn as sns

#data processing
import re, string
import emoji
import nltk

from sklearn import preprocessing
from imblearn.over_sampling import RandomOverSampler
from sklearn.model_selection import train_test_split

#Naive Bayes
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.svm import LinearSVC

#transformers
from transformers import BertTokenizerFast
from transformers import TFBertModel
from transformers import RobertaTokenizerFast
from transformers import TFRobertaModel

#keras
import tensorflow as tf
from tensorflow import keras

#metrics
from sklearn.metrics import accuracy_score, f1_score
from sklearn.metrics import classification_report, confusion_matrix

# Seed value for reproducible runs.
# NOTE(review): this line only defines the number; confirm it is actually
# passed to every sampler / splitter / framework that introduces randomness.
seed = 42

In [70]:
# Folder that holds the CSV exports. Edit this single constant when the data
# lives somewhere else instead of touching every read_csv call.
DATA_DIR = "E:/Coding/sem5/NLP"

# NOTE(review): latin1 decodes any byte sequence without raising, so a UTF-8
# file would load silently with mojibake -- confirm the files' real encoding.
sentiment = pd.read_csv(f"{DATA_DIR}/Sentiment1.csv", encoding='latin1')
train = pd.read_csv(f"{DATA_DIR}/Train1.csv", encoding='latin1')

In [71]:
# Peek at the first rows of the freshly loaded `sentiment` frame.
sentiment.head()

Unnamed: 0,Date,Text,Username,Length_Text,Sentiment
0,Wed Dec 13 23:47:11 +0000 2023,@Hasbil_Lbs @aniesbaswedan gampang sih bikin v...,DzulfiqorParisi,254,Negative
1,Wed Dec 13 23:46:34 +0000 2023,"Lagi, lagi dan terus Tokoh Harapan yg dirinduk...",sumadiseloguno,222,Positive
2,Wed Dec 13 23:43:39 +0000 2023,Biarpun BuzzeRp dikerahkan utk trs bela junjun...,AlfathMelfas,281,Negative
3,Wed Dec 13 23:36:50 +0000 2023,Kagum dgn senyum pak Anies yg teteup tersunggi...,SBahriTweet,216,Positive
4,Wed Dec 13 23:36:04 +0000 2023,Gemoy sih tapiiii..udah TUAAAAA..????...jadi s...,samask_1,161,Neutral


In [72]:
# Column dtypes and non-null counts of the freshly loaded `sentiment` frame.
sentiment.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1336 entries, 0 to 1335
Data columns (total 5 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   Date         1336 non-null   object
 1   Text         1336 non-null   object
 2   Username     1336 non-null   object
 3   Length_Text  1336 non-null   int64 
 4   Sentiment    1336 non-null   object
dtypes: int64(1), object(4)
memory usage: 52.3+ KB


In [73]:
# Peek at the first rows of the freshly loaded `train` frame.
train.head()

Unnamed: 0,Date,Text,Username,Length_Text,Sentiment
0,Wed Dec 13 17:59:27 +0000 2023,Poling Diadakan Oleh Kader PSI Dan Yang Menang...,msw_andi,137,Positive
1,Wed Dec 13 17:48:49 +0000 2023,Unggul Telak Dalam Debat Capres Anies Basweda...,msw_andi,231,Positive
2,Wed Dec 13 17:32:11 +0000 2023,Sihlakan Retweet bagi yang dukung @aniesbaswed...,NafisahKH2022,165,Positive
3,Wed Dec 13 16:50:05 +0000 2023,@DPP_PKB @aniesbaswedan @cakimiNOW good job pa...,pikiranlugu,227,Positive
4,Mon Dec 18 01:40:53 +0000 2023,@Fahrihamzah Wakanda No More ? Indonesia Forev...,RakhaBilly6,128,Neutral


In [74]:
# Column dtypes and non-null counts of the freshly loaded `train` frame.
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1336 entries, 0 to 1335
Data columns (total 5 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   Date         1336 non-null   object
 1   Text         1336 non-null   object
 2   Username     1336 non-null   object
 3   Length_Text  1336 non-null   int64 
 4   Sentiment    1336 non-null   object
dtypes: int64(1), object(4)
memory usage: 52.3+ KB


# Preprocessing data

## Redundant data handling

In [75]:
# Remove the 'Date' and 'Username' columns from both frames.
sentiment = sentiment.drop(columns=['Date', 'Username'])
train = train.drop(columns=['Date', 'Username'])

## Duplicate values handling

In [76]:
# Row count of `sentiment` before de-duplication (baseline for the check below).
sentiment.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1336 entries, 0 to 1335
Data columns (total 3 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   Text         1336 non-null   object
 1   Length_Text  1336 non-null   int64 
 2   Sentiment    1336 non-null   object
dtypes: int64(1), object(2)
memory usage: 31.4+ KB


In [77]:
# Row count of `train` before de-duplication (baseline for the check below).
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1336 entries, 0 to 1335
Data columns (total 3 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   Text         1336 non-null   object
 1   Length_Text  1336 non-null   int64 
 2   Sentiment    1336 non-null   object
dtypes: int64(1), object(2)
memory usage: 31.4+ KB


In [78]:
# Keep one row per distinct 'Text' value in each frame. The surviving rows keep
# their original index labels, so the index is no longer contiguous afterwards.
# NOTE(review): duplicates are removed within each frame only; overlap between
# `sentiment` and `train` is not checked here.
sentiment = sentiment.drop_duplicates(subset='Text')
train = train.drop_duplicates(subset='Text')

In [79]:
# Re-inspect `sentiment` after de-duplication to see how many rows remain.
sentiment.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1327 entries, 0 to 1335
Data columns (total 3 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   Text         1327 non-null   object
 1   Length_Text  1327 non-null   int64 
 2   Sentiment    1327 non-null   object
dtypes: int64(1), object(2)
memory usage: 41.5+ KB


In [80]:
# Re-inspect `train` after de-duplication to see how many rows remain.
train.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1327 entries, 0 to 1335
Data columns (total 3 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   Text         1327 non-null   object
 1   Length_Text  1327 non-null   int64 
 2   Sentiment    1327 non-null   object
dtypes: int64(1), object(2)
memory usage: 41.5+ KB


## Text deep cleaning

In [81]:
def Strip_emoji(text):
    """Return ``text`` with every run of characters from the class below removed.

    The character class is assembled from the range fragments listed in
    ``code_point_ranges``. Be aware that the fragment U+24C2 to U+1F251,
    combined with U+10000 to U+10FFFF, covers every code point from U+24C2
    upwards, so considerably more than emoji is deleted (CJK text, for
    example). Characters below U+24C2 survive unless listed individually.

    Parameters
    ----------
    text : str
        The string to clean.

    Returns
    -------
    str
        ``text`` without the matched characters; nothing is inserted in
        their place.
    """
    code_point_ranges = (
        "\U0001F600-\U0001F64F",
        "\U0001F300-\U0001F5FF",
        "\U0001F680-\U0001F6FF",
        "\U0001F1E0-\U0001F1FF",
        "\U00002500-\U00002BEF",
        "\U00002702-\U000027B0",
        "\U00002702-\U000027B0",  # repeated entry; harmless inside a character class
        "\U000024C2-\U0001F251",
        "\U0001f926-\U0001f937",
        "\U00010000-\U0010ffff",
        "\u200d",
        "\u2640-\u2642",
        "\u2600-\u2B55",
        "\u23cf",
        "\u23e9",
        "\u231a",
        "\u3030",
    )
    matcher = re.compile("[" + "".join(code_point_ranges) + "]+", flags=re.UNICODE)
    return matcher.sub("", text)

# Translation table that deletes every ASCII punctuation character. It never
# changes between calls, so it is built once instead of on every invocation.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)


def Strip_all_entities(text):
    """Lower-case ``text`` and strip mentions, links, non-ASCII and punctuation.

    Steps, in order:

    1. Carriage returns are deleted and newlines become single spaces.
    2. The text is lower-cased.
    3. Every ``@...`` or ``http(s)://...`` token is removed up to the next
       whitespace character.
    4. Every non-ASCII character is deleted.
    5. Every character of ``string.punctuation`` is deleted (not replaced by
       a space, so ``"a..b"`` becomes ``"ab"``).

    Because step 4 already removes all non-ASCII characters, no separate
    list of accented / mojibake characters is needed in step 5.

    Parameters
    ----------
    text : str
        The string to clean.

    Returns
    -------
    str
        The cleaned, lower-cased, ASCII-only string. Whitespace left behind
        by removed tokens is not collapsed here.
    """
    text = text.replace('\r', '').replace('\n', ' ').lower()
    text = re.sub(r"(?:\@|https?\://)\S+", "", text)
    text = re.sub(r'[^\x00-\x7f]', r'', text)
    return text.translate(_PUNCTUATION_TABLE)

def Clean_hashtags(tweet):
    """Drop trailing hashtags and unwrap the remaining ones.

    First pass: hashtags that sit at the very end of the tweet (followed only
    by whitespace and further hashtags) are removed entirely, except the
    literal tag ``#hashtag``, which the pattern excludes.
    Second pass: every remaining ``#`` and ``_`` is treated as a separator, so
    a mid-sentence ``#some_tag`` becomes ``some tag``.

    The patterns are raw strings so that the word-boundary and character-class
    escapes reach the regex engine unchanged; in a normal string literal the
    word-boundary escape is read as a backspace character and the others
    trigger invalid-escape warnings.

    Parameters
    ----------
    tweet : str
        The text to process.

    Returns
    -------
    str
        The text with trailing hashtags removed and ``#`` / ``_`` replaced by
        single spaces; each resulting piece is stripped before re-joining.
    """
    without_trailing_tags = " ".join(
        piece.strip()
        for piece in re.split(r'#(?!(?:hashtag)\b)[\w-]+(?=(?:\s+#[\w-]+)*\s*$)', tweet)
    )
    return " ".join(piece.strip() for piece in re.split(r'#|_', without_trailing_tags))

def Filter_chars(a):
    """Blank out every space-separated token that contains '$' or '&'.

    The string is split on single spaces; a token containing either character
    is replaced by an empty string (so its surrounding spaces remain), and all
    other tokens are kept unchanged.

    Parameters
    ----------
    a : str
        The text to filter.

    Returns
    -------
    str
        The tokens re-joined with single spaces.
    """
    kept_tokens = [
        '' if ('$' in token or '&' in token) else token
        for token in a.split(' ')
    ]
    return ' '.join(kept_tokens)

def Remove_mult_spaces(text):
    """Collapse every run of two or more whitespace characters into one space.

    A single whitespace character (including a lone newline or tab) is left
    untouched. The pattern is a raw string so the whitespace escape is passed
    to the regex engine as written rather than being flagged as an invalid
    string escape.

    Parameters
    ----------
    text : str
        The text to normalise.

    Returns
    -------
    str
        ``text`` with each multi-character whitespace run replaced by ``" "``.
    """
    return re.sub(r"\s\s+", " ", text)

def Text_deep_cleaning(text):
    """Run the whole cleaning pipeline on one tweet and return the result.

    The stages run in this order: Strip_emoji, Strip_all_entities,
    Clean_hashtags, Filter_chars, Remove_mult_spaces.

    Parameters
    ----------
    text : str
        The raw tweet text.

    Returns
    -------
    str
        The cleaned text.
    """
    # NOTE(review): Strip_all_entities deletes all of string.punctuation, which
    # includes '#', '_', '$' and '&'. Clean_hashtags and Filter_chars therefore
    # receive text that no longer contains the characters they look for (an
    # '&amp;' entity ends up as the word 'amp'). Confirm this order is intended.
    without_emoji = Strip_emoji(text)
    without_entities = Strip_all_entities(without_emoji)
    without_hashtags = Clean_hashtags(without_entities)
    filtered = Filter_chars(without_hashtags)
    return Remove_mult_spaces(filtered)

  new_tweet = " ".join(word.strip() for word in re.split('#(?!(?:hashtag)\b)[\w-]+(?=(?:\s+#[\w-]+)*\s*$)', tweet))
  return re.sub("\s\s+" , " ", text)


In [82]:
# Replace the tweet text of both frames with its cleaned form, one frame at a
# time. The lists are assigned positionally, so the frames' index is untouched.
cleaned_text = [Text_deep_cleaning(text) for text in sentiment.Text]
sentiment['Text'] = cleaned_text

cleaned_text = [Text_deep_cleaning(text) for text in train.Text]
train['Text'] = cleaned_text

In [83]:
# Overwrite Length_Text (as loaded from the CSV) with the number of
# whitespace-separated words in the cleaned text.
# Despite its name, `cleaned_text` holds those word counts in this cell.
cleaned_text = [len(text.split()) for text in sentiment.Text]
sentiment['Length_Text'] = cleaned_text

cleaned_text = [len(text.split()) for text in train.Text]
train['Length_Text'] = cleaned_text

In [84]:
# Spot-check the cleaned text and recomputed word counts in `sentiment`.
sentiment.head()

Unnamed: 0,Text,Length_Text,Sentiment
0,gampang sih bikin video kya gt konsepnya gt ga...,28,Negative
1,lagi lagi dan terus tokoh harapan yg dirinduka...,23,Positive
2,biarpun buzzerp dikerahkan utk trs bela junjun...,28,Negative
3,kagum dgn senyum pak anies yg teteup tersunggi...,27,Positive
4,gemoy sih tapiiiiudah tuaaaaajadi sering lupa ...,10,Neutral


In [85]:
# Spot-check the cleaned text and recomputed word counts in `train`.
train.head()

Unnamed: 0,Text,Length_Text,Sentiment
0,poling diadakan oleh kader psi dan yang menang...,11,Positive
1,unggul telak dalam debat capres anies baswedan...,20,Positive
2,sihlakan retweet bagi yang dukung amp aniesmuh...,7,Positive
3,good job pak anies baswedan bersama pak muhaim...,22,Positive
4,wakanda no more indonesia forever aniesmuhaimi...,6,Neutral


## Tokenizer

In [86]:
# Load the fast BERT tokenizer for the 'bert-base-uncased' checkpoint.
# NOTE(review): the tweet previews in this notebook look Indonesian, while this
# checkpoint's vocabulary is English -- confirm the choice is intentional.
tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')

In [87]:
# Store, for every cleaned tweet, the length of the id list returned by
# tokenizer.encode (truncation enabled) in a new 'token_len' column.
# `token_len` stays bound to the most recent list, i.e. the one for `train`.
token_len = [len(tokenizer.encode(text, truncation=True)) for text in sentiment.Text]
sentiment['token_len'] = token_len

token_len = [len(tokenizer.encode(text, truncation=True)) for text in train.Text]
train['token_len'] = token_len

In [88]:
# Preview `sentiment` with the newly added token_len column.
sentiment.head()

Unnamed: 0,Text,Length_Text,Sentiment,token_len
0,gampang sih bikin video kya gt konsepnya gt ga...,28,Negative,68
1,lagi lagi dan terus tokoh harapan yg dirinduka...,23,Positive,66
2,biarpun buzzerp dikerahkan utk trs bela junjun...,28,Negative,84
3,kagum dgn senyum pak anies yg teteup tersunggi...,27,Positive,62
4,gemoy sih tapiiiiudah tuaaaaajadi sering lupa ...,10,Neutral,33


In [89]:
# Preview `train` with the newly added token_len column.
train.head()

Unnamed: 0,Text,Length_Text,Sentiment,token_len
0,poling diadakan oleh kader psi dan yang menang...,11,Positive,28
1,unggul telak dalam debat capres anies baswedan...,20,Positive,62
2,sihlakan retweet bagi yang dukung amp aniesmuh...,7,Positive,24
3,good job pak anies baswedan bersama pak muhaim...,22,Positive,54
4,wakanda no more indonesia forever aniesmuhaimi...,6,Neutral,16


#