In [2]:
# Import core data-handling libraries (third-party)
import numpy as np
import pandas as pd

# Import data visualization libraries
import matplotlib.pyplot as plt
import seaborn as sns

# Import text-processing libraries (NLTK, regex, Unicode normalization)
import nltk
import re
import unicodedata
from nltk.corpus import stopwords
from nltk.tokenize import TweetTokenizer
from nltk.stem import WordNetLemmatizer


import warnings
warnings.filterwarnings('ignore')

In [3]:
# Fetch the NLTK data used by clean_tweet: the stopword lists and the WordNet
# corpus that WordNetLemmatizer looks words up in.
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Developer\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Developer\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

## Load dataset

In [8]:
# Read the training set; the path is relative to the notebook's working directory.
df = pd.read_csv('../src/data/train.csv')

In [9]:
# Preview the first rows (head() defaults to 5) to sanity-check the load.
df.head()

Unnamed: 0,count,hate_speech_count,offensive_language_count,neither_count,class,tweet
0,3,0,0,3,2,!!! RT @mayasolovely: As a woman you shouldn't...
1,3,0,3,0,1,!!!!! RT @mleew17: boy dats cold...tyga dwn ba...
2,3,0,3,0,1,!!!!!!! RT @UrKindOfBrand Dawg!!!! RT @80sbaby...
3,3,0,2,1,1,!!!!!!!!! RT @C_G_Anderson: @viva_based she lo...
4,6,0,6,0,1,!!!!!!!!!!!!! RT @ShenikaRoberts: The shit you...


In [10]:
# Summarize column names, dtypes and non-null counts.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 24783 entries, 0 to 24782
Data columns (total 6 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   count                     24783 non-null  int64 
 1   hate_speech_count         24783 non-null  int64 
 2   offensive_language_count  24783 non-null  int64 
 3   neither_count             24783 non-null  int64 
 4   class                     24783 non-null  int64 
 5   tweet                     24783 non-null  object
dtypes: int64(5), object(1)
memory usage: 1.1+ MB


In [11]:
# Count rows that exactly repeat an earlier row across all columns.
df.duplicated().sum()

0

In [12]:
# Cache for the NLTK helpers used by clean_tweet. They are identical for every
# tweet, so they are built once on first use instead of once per row.
_CLEAN_TWEET_RESOURCES = {}


def _get_clean_tweet_resources():
    """Return the cached (tokenizer, stop_words, lemmatizer) triple, building it on first call."""
    if not _CLEAN_TWEET_RESOURCES:
        # Build everything before touching the cache so a failure (e.g. a missing
        # corpus) cannot leave a half-filled cache behind.
        tokenizer = TweetTokenizer(preserve_case=False, reduce_len=True, strip_handles=True)
        stop_words = frozenset(stopwords.words('english'))
        lemmatizer = WordNetLemmatizer()
        _CLEAN_TWEET_RESOURCES.update(
            tokenizer=tokenizer,
            stop_words=stop_words,
            lemmatizer=lemmatizer,
        )
    return (
        _CLEAN_TWEET_RESOURCES['tokenizer'],
        _CLEAN_TWEET_RESOURCES['stop_words'],
        _CLEAN_TWEET_RESOURCES['lemmatizer'],
    )


def clean_tweet(tweet):
    """Normalize a raw tweet into a space-separated string of lemmatized tokens.

    Steps, in order: replace URLs with the placeholder ``url``; drop every
    character that is neither a word character nor whitespace (this covers
    ``#``, ``@`` and ``:``); drop digits; turn line breaks into spaces; drop
    the retweet marker ``RT``; reduce accented characters to plain ASCII;
    tokenize and lowercase; remove English stopwords and one-character
    tokens; lemmatize.

    Parameters
    ----------
    tweet : str
        Raw tweet text.

    Returns
    -------
    str
        The cleaned tokens joined by single spaces. May be an empty string
        when nothing survives the filters.
    """
    tokenizer, stop_words, lemmatizer = _get_clean_tweet_resources()

    # URLs must be handled before any punctuation is stripped: the pattern
    # needs the literal "://" to match, so removing ':' first would leave
    # links behind as "httptco..." junk tokens.
    tweet = re.sub(r'https?://[^\s]+', 'url', tweet)

    # Remove punctuation and symbols, including '#', '@' and ':'.
    # Underscores are word characters and are therefore kept.
    tweet = re.sub(r'[^\w\s]', '', tweet)

    # Remove numbers
    tweet = re.sub(r'\d+', '', tweet)

    # Replace line breaks with a space rather than deleting them, so the last
    # word of one line is not fused with the first word of the next.
    tweet = re.sub(r'[\n\r]+', ' ', tweet)

    # Remove the retweet marker 'RT' (any case, whole word only)
    tweet = re.sub(r'\bRT\b', '', tweet, flags=re.IGNORECASE)

    # Decompose accented characters and drop everything that is not ASCII.
    # Currency symbols need no separate step: they are not word characters,
    # so the punctuation pass above already removed them.
    tweet = unicodedata.normalize('NFKD', tweet).encode('ASCII', 'ignore').decode('utf-8')

    # Tokenization (also lowercases and shortens long character runs).
    # NOTE(review): strip_handles=True has no effect here because '@' has
    # already been removed, so handle names stay in the output as ordinary
    # words. Confirm whether handles should be dropped entirely.
    tokens = tokenizer.tokenize(tweet)

    # Remove stopwords and single-character tokens
    tokens = [
        token for token in tokens
        if token.lower() not in stop_words and len(token) > 1
    ]

    # Lemmatization (WordNetLemmatizer treats each token as a noun by default)
    tokens = [lemmatizer.lemmatize(token) for token in tokens]

    # Rejoin tokens into a string
    return ' '.join(tokens)

In [13]:
# Run every raw tweet through clean_tweet and keep the result next to the original text.
df['clean_tweet'] = df['tweet'].map(clean_tweet)

In [14]:
# Display the frame to compare raw tweets with their cleaned versions.
df

Unnamed: 0,count,hate_speech_count,offensive_language_count,neither_count,class,tweet,clean_tweet
0,3,0,0,3,2,!!! RT @mayasolovely: As a woman you shouldn't...,mayasolovely woman shouldnt complain cleaning ...
1,3,0,3,0,1,!!!!! RT @mleew17: boy dats cold...tyga dwn ba...,mleew boy dat coldtyga dwn bad cuffin dat hoe ...
2,3,0,3,0,1,!!!!!!! RT @UrKindOfBrand Dawg!!!! RT @80sbaby...,urkindofbrand dawg sbabylife ever fuck bitch s...
3,3,0,2,1,1,!!!!!!!!! RT @C_G_Anderson: @viva_based she lo...,c_g_anderson viva_based look like tranny
4,6,0,6,0,1,!!!!!!!!!!!!! RT @ShenikaRoberts: The shit you...,shenikaroberts shit hear might true might fake...
...,...,...,...,...,...,...,...
24778,3,0,2,1,1,you's a muthaf***in lie &#8220;@LifeAsKing: @2...,yous muthafin lie lifeasking _pearls corey_ema...
24779,3,0,1,2,2,"you've gone and broke the wrong heart baby, an...",youve gone broke wrong heart baby drove rednec...
24780,3,0,3,0,1,young buck wanna eat!!.. dat nigguh like I ain...,young buck wanna eat dat nigguh like aint fuck...
24781,6,0,6,0,1,youu got wild bitches tellin you lies,youu got wild bitch tellin lie


In [15]:
# Persist the frame, including the new clean_tweet column, without writing the index.
# NOTE(review): a tweet that cleans to an empty string is written as an empty
# field, which pd.read_csv reads back as NaN by default. Confirm that downstream
# code handles this.
df.to_csv('../src/data/train_clean.csv', index=False)