In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from nltk.tokenize import TweetTokenizer
from nltk.corpus import stopwords, words
from textblob import TextBlob
import zipfile
import os, re

In [2]:
# Directory that holds the dataset archive. Set the TWEETS_DATA_DIR environment
# variable to run the notebook on another machine; when it is unset, the
# original author's local folder is used so existing behavior is unchanged.
path = os.environ.get(
    'TWEETS_DATA_DIR',
    r'C:\Users\Jchukwuedozi\Documents\data science\datasets\deep learning',
)

In [3]:
# Read dataset.csv straight out of the zip archive, without extracting it to disk.
archive_path = os.path.join(path, 'Twitter sentiment (NLP).zip')
with zipfile.ZipFile(archive_path, 'r') as archive, archive.open('dataset.csv') as csv_file:
    all_tweets = pd.read_csv(csv_file)

In [4]:
# Preview the first rows to check the Text / Language / Label columns loaded as expected.
all_tweets.head()

Unnamed: 0,Text,Language,Label
0,@Charlie_Corley @Kristine1G @amyklobuchar @Sty...,en,litigious
1,#BadBunny: Como dos gotas de agua: Joven se di...,es,negative
2,https://t.co/YJNiO0p1JV Flagstar Bank disclose...,en,litigious
3,Rwanda is set to host the headquarters of Unit...,en,positive
4,OOPS. I typed her name incorrectly (today’s br...,en,litigious


In [5]:
# Print column dtypes and non-null counts. In the recorded output, Language is
# the only column with missing values (937831 non-null out of 937854 rows).
all_tweets.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 937854 entries, 0 to 937853
Data columns (total 3 columns):
 #   Column    Non-Null Count   Dtype 
---  ------    --------------   ----- 
 0   Text      937854 non-null  object
 1   Language  937831 non-null  object
 2   Label     937854 non-null  object
dtypes: object(3)
memory usage: 21.5+ MB


In [6]:
# Drop every row that has a missing value in any column. The result is rebound
# to the same name instead of using inplace=True, so the frame is never
# mutated in place and the statement can be used in a method chain.
all_tweets = all_tweets.dropna()

In [7]:
# Work on an independent copy of the first 10 rows
# (presumably a small development sample -- TODO confirm the intended size).
tweets = all_tweets.head(10).copy()

In [8]:
# Keep Language values shorter than four characters; anything longer is
# replaced with NaN (presumably to flag malformed language codes -- TODO confirm).
tweets['Language'] = tweets['Language'].map(
    lambda code: np.nan if len(code) >= 4 else code
)

In [9]:
# Drop the rows whose Language was set to NaN above. Rebinding the result
# avoids inplace=True, so the frame is never mutated in place.
tweets = tweets.dropna()

### Removal of stopwords

In [10]:
# Lowercase every token and collapse runs of whitespace into single spaces.
tweets['Text'] = tweets['Text'].apply(
    lambda text: ' '.join(token.lower() for token in text.split())
)

# Build the stopword lookup once, as a set, so each membership test is a hash
# lookup rather than a scan of the whole list for every token.
# A dedicated name is used on purpose: assigning to `words` would overwrite the
# `words` corpus imported from nltk.corpus at the top of the notebook.
# NOTE(review): stopwords.words() is called without a language, which pulls in
# the stopwords of every language NLTK ships -- presumably intended because the
# tweets are multilingual; confirm.
stop_words = set(stopwords.words())

# Drop stopword tokens; a tweet made only of stopwords becomes an empty string.
tweets['Text'] = tweets['Text'].apply(
    lambda text: ' '.join(token for token in text.split() if token not in stop_words)
)

### Standardizing words

In [11]:
# Lookup table used by standardize_text: informal abbreviation -> expanded form.
abbreviation_dict = {
    'ur': 'your',
    'sef': 'self',
    'wbu': 'what about you',
    'wat': 'what',
    'den': 'then',
    'd': 'the',
    'lyk': 'like',
    'msg': 'message',
    'fyi': 'for your information',
    'nyt': 'night',
    'nvm': 'nevermind',
    'swt': 'sweet',
}

In [12]:
def standardize_text(sentence, abbreviations=None):
    '''Expand known abbreviations in a sentence.

    The sentence is split on whitespace; every token that is a key of the
    abbreviation table is replaced by its expansion and all other tokens are
    kept unchanged. Tokens are re-joined with single spaces.

    Matching is exact: it is case-sensitive, and a token with attached
    punctuation (e.g. "ur,") is not expanded.

    sentence -- the sentence to standardize.
    abbreviations -- optional mapping of abbreviation -> expansion; when
        omitted, the notebook-level abbreviation_dict is used.

    Returns the standardized sentence. An empty or whitespace-only sentence
    yields an empty string (previously this raised UnboundLocalError, which
    happens for tweets that consist only of stopwords).
    '''
    if abbreviations is None:
        # Resolved at call time so later edits to abbreviation_dict are picked up.
        abbreviations = abbreviation_dict
    return ' '.join(abbreviations.get(word, word) for word in sentence.split())

In [13]:
# Independent copy, so the standardization steps below leave `tweets` untouched.
data = tweets.copy()

In [14]:
# Expand abbreviations in every tweet.
data['Text'] = data['Text'].map(standardize_text)

### Correct spelling to avoid having multiple copies of the same word

In [15]:
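# Alternative kept for reference: correct each whole tweet with a single
# TextBlob call instead of correcting it word by word.
# data['Text'] = data['Text'].apply(lambda x: str(TextBlob(x).correct()))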

In [36]:
def correct_spelling_blob(sentence):
    '''Spell-correct a sentence one word at a time with TextBlob.

    The sentence is split on whitespace, each token is passed individually to
    TextBlob(...).correct(), and the corrected tokens are re-joined with
    single spaces.

    sentence -- the sentence to correct.

    Returns the corrected sentence. An empty or whitespace-only sentence
    yields an empty string (previously this raised UnboundLocalError).
    '''
    return ' '.join(str(TextBlob(word).correct()) for word in sentence.split())

In [17]:
# Spell-correct every tweet, word by word.
data['Text'] = data['Text'].map(correct_spelling_blob)