In [None]:
import re
import time
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import plotly.express as px
import string
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, train_test_split
from xgboost import XGBClassifier
# import f1_score
from sklearn.metrics import f1_score

# import multiprocessing
from multiprocessing import Pool

import nltk
from nltk.tokenize import RegexpTokenizer
# Fetch the 'wordnet' and 'omw-1.4' NLTK data packages into the local NLTK
# data directory; when a package is already present NLTK just reports it as
# up to date.
# NOTE(review): neither package is used by the cells visible here -- presumably
# needed by a later lemmatization step; confirm or drop.
nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [None]:
# Load the sentiment-analysis CSV into the working DataFrame `df`.
# NOTE(review): the preview further down shows ID values in scientific
# notation, which suggests the ID column is parsed as float -- confirm whether
# it should be read as a string/integer to avoid precision loss.
raw_csv_path = "/content/drive/MyDrive/MIE1624_final_project/sentiment_analysis.csv"
df = pd.read_csv(raw_csv_path)

## Helper Functions

In [None]:
# Strip punctuation characters from a string
def remove_punc(s):
    """Return `s` with every character listed in `string.punctuation` deleted.

    Characters outside that ASCII punctuation set (letters, digits,
    whitespace, non-ASCII symbols) are left untouched.
    """
    # Mapping a character to None in a translation table deletes it.
    deletion_table = str.maketrans(dict.fromkeys(string.punctuation))
    return s.translate(deletion_table)

# Collapse runs of an identical character into a single occurrence
def remove_repeating_char(text):
    """Return `text` with each run of a repeated character reduced to one.

    Applies to any character matched by '.', so repeated spaces and
    legitimately doubled letters ("hello" -> "helo") are collapsed as well.
    Because '.' does not match a newline, consecutive newlines are kept.
    """
    repeated_run = re.compile(r'(.)\1{1,}')
    return repeated_run.sub(r'\1', text)

# Load a stop-word list from a text file (one entry per line)
def get_stop_words_from_txt(fname):
    """Read `fname` as UTF-8 and return its lines as a list of stop words.

    Each line is passed through `remove_repeating_char`, so the returned
    words have repeated characters collapsed in the same way as text that
    has been through that helper.
    """
    with open(fname, 'r', encoding='utf-8') as handle:
        raw_lines = handle.read().splitlines()
    return [remove_repeating_char(line) for line in raw_lines]

# Drop stop words from a piece of text
def clean_stop_words(text, stop_words):
    """Return `text` with every whitespace-separated token found in
    `stop_words` removed.

    `text` is converted with `str()` first; the surviving tokens are
    re-joined with single spaces, so original spacing is not preserved.
    Matching is exact and case-sensitive.
    """
    kept_tokens = []
    for token in str(text).split():
        if token in stop_words:
            continue
        kept_tokens.append(token)
    return " ".join(kept_tokens)

## Function to Clean Data

In [None]:
def clean_data(df_data, add_stop_words=None,
               stop_words_path='/content/drive/MyDrive/MIE1624_final_project/minimal_stop_words.txt'):
    """Normalize a pandas Series of raw text and return the cleaned Series.

    Steps, in order: missing values -> empty string, lower-casing, URL
    removal, digit removal, punctuation removal, collapsing of repeated
    characters, and stop-word removal.

    Parameters
    ----------
    df_data : pandas.Series
        The text column to clean. Missing values become '' (callers can
        drop empty rows afterwards); other values are converted with `str`.
    add_stop_words : iterable of str, optional
        Extra stop words to remove in addition to those read from
        `stop_words_path`. They are used exactly as given.
        NOTE(review): unlike the file's words they are not passed through
        `remove_repeating_char`, so an entry such as "good" will not match
        the already-collapsed token "god" -- confirm this is intended.
    stop_words_path : str, optional
        Text file with one stop word per line. Defaults to the project's
        minimal stop-word list.

    Returns
    -------
    pandas.Series
        Cleaned text, same index as the input.
    """
    # Without this, NaN / non-string entries reach remove_punc as floats and
    # raise AttributeError ('float' object has no attribute 'translate').
    df_data = df_data.fillna('').astype(str)

    # Convert text to lower case
    df_data = df_data.str.lower()

    # Remove URLs. Raw string avoids the invalid "\S" escape warning; the
    # pattern now also covers plain http:// links and requires a literal dot
    # after "www". Text is already lower-cased, so no case flag is needed.
    df_data = df_data.str.replace(r'http\S+|www\.\S+', '', regex=True)

    # Remove all digits
    df_data = df_data.str.replace(r'\d+', '', regex=True)

    # Remove punctuation
    df_data = df_data.apply(remove_punc)

    # Collapse successive repeating characters
    df_data = df_data.apply(remove_repeating_char)

    # Build the stop-word collection once as a set so the per-token
    # membership test is O(1) rather than a scan of the whole list.
    stop_words = set(get_stop_words_from_txt(stop_words_path))
    if add_stop_words:
        stop_words.update(add_stop_words)

    # Remove stop words from every row
    return df_data.apply(lambda text: clean_stop_words(text, stop_words))

## Save the cleaned data to a CSV file

In [None]:
# Run the cleaning pipeline over the text column
df['text'] = clean_data(df['text'])

# Turn empty / whitespace-only cells into NaN, then discard every row that
# has a missing value in any column
df = df.replace(r'^\s*$', np.nan, regex=True).dropna()

# Persist the cleaned frame as CSV, without writing the index
cleaned_csv_path = '/content/drive/MyDrive/MIE1624_final_project/cleaned_data.csv'
df.to_csv(cleaned_csv_path, index=False)

# Preview the first rows of the cleaned data
df.head()

Unnamed: 0,ID,text,label
0,7.68098e+17,josh jenkins loking forward tab breders crown ...,1
1,7.68098e+17,mianusmanjaved congratulations pakistan becomi...,1
2,7.68098e+17,pepalerts september yesmag taking maine mendoz...,1
3,7.68098e+17,davidgaibis newly painted wals thanks milion c...,1
4,7.68098e+17,cedricfeschote excited anounce july feschote l...,1
