# Data Loading

In [8]:
import pandas as pd
from google.colab import files
uploaded = files.upload()

df = pd.read_csv('sentiment_tweets3.csv')


Saving sentiment_tweets3.csv to sentiment_tweets3 (1).csv


In [9]:
df.head()

Unnamed: 0,Index,message to examine,label (depression result)
0,106,just had a real good moment. i missssssssss hi...,0
1,217,is reading manga http://plurk.com/p/mzp1e,0
2,220,@comeagainjen http://twitpic.com/2y2lx - http:...,0
3,288,@lapcat Need to send 'em to my accountant tomo...,0
4,540,ADD ME ON MYSPACE!!! myspace.com/LookThunder,0


# Pre-processing

# Case Folding

In [12]:
df['message to examine'][0].lower()
df['message to examine']=df[  'message to examine'].str.lower()
df.head()

Unnamed: 0,Index,message to examine,label (depression result)
0,106,just had a real good moment. i missssssssss hi...,0
1,217,is reading manga http://plurk.com/p/mzp1e,0
2,220,@comeagainjen http://twitpic.com/2y2lx - http:...,0
3,288,@lapcat need to send 'em to my accountant tomo...,0
4,540,add me on myspace!!! myspace.com/lookthunder,0


# Removing Tags

In [13]:
import re
def remove_tags(text):
  pattern = re.compile('<.*?>')
  return pattern.sub(r'',text)

df['message to examine']=df['message to examine'].apply(remove_tags)
df.head()

Unnamed: 0,Index,message to examine,label (depression result)
0,106,just had a real good moment. i missssssssss hi...,0
1,217,is reading manga http://plurk.com/p/mzp1e,0
2,220,@comeagainjen http://twitpic.com/2y2lx - http:...,0
3,288,@lapcat need to send 'em to my accountant tomo...,0
4,540,add me on myspace!!! myspace.com/lookthunder,0


# Removing URL

In [14]:
def remove_url(text):
  pattern = re.compile(r'https?://\S+|www\.\S+')
  return pattern.sub(r'',text)

df['message to examine']=df['message to examine'].apply(remove_url)
df.head()

Unnamed: 0,Index,message to examine,label (depression result)
0,106,just had a real good moment. i missssssssss hi...,0
1,217,is reading manga,0
2,220,@comeagainjen -,0
3,288,@lapcat need to send 'em to my accountant tomo...,0
4,540,add me on myspace!!! myspace.com/lookthunder,0


# Removing Punctuations

In [15]:
import string
def remove_punctuations(text):
  translator = str.maketrans('', '', string.punctuation)
  return text.translate(translator)

df['message to examine']=df['message to examine'].apply(remove_punctuations)
df.head()

Unnamed: 0,Index,message to examine,label (depression result)
0,106,just had a real good moment i missssssssss him...,0
1,217,is reading manga,0
2,220,comeagainjen,0
3,288,lapcat need to send em to my accountant tomorr...,0
4,540,add me on myspace myspacecomlookthunder,0


# Replace Short Hands

In [16]:
from google.colab import files

uploaded = files.upload()

file_path = 'slang.txt'

data_dict = {}
with open(file_path, 'r') as file:
    for line in file:
        line = line.strip()
        if line and '=' in line:
            key, value = line.split('=', 1)
            data_dict[key] = value

print(data_dict)

def chat_conversion(text):
    new_text =[]
    for c in text.split():
        if c.upper() in data_dict:
            new_text.append(data_dict[c.upper()])
        else:
            new_text.append(c)
    return " ".join(new_text)

Saving slang.txt to slang.txt
{'AFAIK': 'As Far As I Know', 'AFK': 'Away From Keyboard', 'ASAP': 'As Soon As Possible', 'ATK': 'At The Keyboard', 'ATM': 'At The Moment', 'A3': 'Anytime, Anywhere, Anyplace', 'BAK': 'Back At Keyboard', 'BBL': 'Be Back Later', 'BBS': 'Be Back Soon', 'BFN': 'Bye For Now', 'B4N': 'Bye For Now', 'BRB': 'Be Right Back', 'BRT': 'Be Right There', 'BTW': 'By The Way', 'B4': 'Before', 'CU': 'See You', 'CUL8R': 'See You Later', 'CYA': 'See You', 'FAQ': 'Frequently Asked Questions', 'FC': 'Fingers Crossed', 'FWIW': "For What It's Worth", 'FYI': 'For Your Information', 'GAL': 'Get A Life', 'GG': 'Good Game', 'GN': 'Good Night', 'GMTA': 'Great Minds Think Alike', 'GR8': 'Great!', 'G9': 'Genius', 'IC': 'I See', 'ICQ': 'I Seek you (also a chat program)', 'ILU': 'ILU: I Love You', 'IMHO': 'In My Honest/Humble Opinion', 'IMO': 'In My Opinion', 'IOW': 'In Other Words', 'IRL': 'In Real Life', 'KISS': 'Keep It Simple, Stupid', 'LDR': 'Long Distance Relationship', 'LMAO': 'L

In [17]:
df['message to examine']=df['message to examine'].apply(chat_conversion)
df.head()

Unnamed: 0,Index,message to examine,label (depression result)
0,106,just had a real good moment i missssssssss him...,0
1,217,is reading manga,0
2,220,comeagainjen,0
3,288,lapcat need to send em to my accountant tomorr...,0
4,540,add me on myspace myspacecomlookthunder,0


# Correct Incorrect Words

In [18]:
!pip install textblob



In [None]:
from textblob import TextBlob

df['message to examine']=df['message to examine'].apply(lambda x: str(TextBlob(x).correct()))
df.head()


# Stop words removal

In [23]:
import nltk
from nltk.corpus import stopwords
# nltk.download('stopwords')
stopwords.words('english')

def remove_stopwords(text):
    new_text = []
    for word in text.split():
        if word in stopwords.words('english'):
            new_text.append('')
        else:
            new_text.append(word)
    x = new_text[:]
    new_text.clear()
    return " ".join(x)

remove_stopwords(df['message to examine'][6])

'silkcharm  nbn  someone already said  fiber   home mean     least  regular '

# Emoji Removal

# Tokenization

In [32]:
import nltk
# nltk.download('punkt')
from nltk.tokenize import word_tokenize, sent_tokenize

print(word_tokenize('Muad Dib learned rapidly because his first training was in how to learn. And the first lesson of all was the basic trust that he could learn. It is shocking to find how many people do not believe they can learn, and how many more believe learning to be difficult."'))
print(sent_tokenize(df['message to examine'][50]))

['Muad', 'Dib', 'learned', 'rapidly', 'because', 'his', 'first', 'training', 'was', 'in', 'how', 'to', 'learn', '.', 'And', 'the', 'first', 'lesson', 'of', 'all', 'was', 'the', 'basic', 'trust', 'that', 'he', 'could', 'learn', '.', 'It', 'is', 'shocking', 'to', 'find', 'how', 'many', 'people', 'do', 'not', 'believe', 'they', 'can', 'learn', ',', 'and', 'how', 'many', 'more', 'believe', 'learning', 'to', 'be', 'difficult', '.', "''"]
['emmalight id forgotten about that we need to do that again sometime']


# Stemming

In [30]:
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize

stemmer = PorterStemmer()
words =word_tokenize(df['message to examine'][10])
stemmed_words = [stemmer.stem(word) for word in words]
print(stemmed_words)

['good', 'morn', 'everybodi']


# Lemmatizing

In [33]:
from nltk.stem import WordNetLemmatizer     #lexical dictionary
nltk.download('wordnet')

lemmatizer = WordNetLemmatizer()
lemmatizer.lemmatize("eggs")
string_for_lemmatizing = df['message to examine'][30]
words
lemmatized_words = [lemmatizer.lemmatize(word) for word in words]
lemmatized_words



[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


['good', 'morning', 'everybody']