## CLEANING DATA

In [43]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize


In [44]:
#Read the data 
df=pd.read_csv('Apple-Twitter-Sentiment-DFE.csv', encoding='latin1')

In [45]:
df['sentiment'].value_counts(
)

3               2162
1               1219
5                423
not_relevant      82
Name: sentiment, dtype: int64

In [46]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3886 entries, 0 to 3885
Data columns (total 12 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   _unit_id              3886 non-null   int64  
 1   _golden               3886 non-null   bool   
 2   _unit_state           3886 non-null   object 
 3   _trusted_judgments    3886 non-null   int64  
 4   _last_judgment_at     3783 non-null   object 
 5   sentiment             3886 non-null   object 
 6   sentiment:confidence  3886 non-null   float64
 7   date                  3886 non-null   object 
 8   id                    3886 non-null   float64
 9   query                 3886 non-null   object 
 10  sentiment_gold        103 non-null    object 
 11  text                  3886 non-null   object 
dtypes: bool(1), float64(2), int64(2), object(7)
memory usage: 337.9+ KB


In [47]:
df.isna().sum()

_unit_id                   0
_golden                    0
_unit_state                0
_trusted_judgments         0
_last_judgment_at        103
sentiment                  0
sentiment:confidence       0
date                       0
id                         0
query                      0
sentiment_gold          3783
text                       0
dtype: int64

In [48]:
df.drop(columns=['_last_judgment_at', 'sentiment_gold'],inplace=True)

In [49]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3886 entries, 0 to 3885
Data columns (total 10 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   _unit_id              3886 non-null   int64  
 1   _golden               3886 non-null   bool   
 2   _unit_state           3886 non-null   object 
 3   _trusted_judgments    3886 non-null   int64  
 4   sentiment             3886 non-null   object 
 5   sentiment:confidence  3886 non-null   float64
 6   date                  3886 non-null   object 
 7   id                    3886 non-null   float64
 8   query                 3886 non-null   object 
 9   text                  3886 non-null   object 
dtypes: bool(1), float64(2), int64(2), object(5)
memory usage: 277.2+ KB


In [50]:
df_clean=df.copy()

In [51]:

def clean_text(text):
    if isinstance(text, str):
        # Remove URLs
        text = re.sub(r'http\S+|www\S+', '', text)
        # Remove mentions
        text = re.sub(r'@\w+', '', text)
        # Remove hashtags
        text = re.sub(r'#\w+', '', text)
        # Remove RT (retweet) indicator
        text = re.sub(r'^RT\s+', '', text)
        # Remove multiple spaces and strip
        text = re.sub(r'\s+', ' ', text)
        text = text.strip()
    return text

In [52]:
# Apply cleaning to the 'text' column
df_clean['text'] = df_clean['text'].apply(clean_text)

In [53]:
df.info

<bound method DataFrame.info of        _unit_id  _golden _unit_state  _trusted_judgments sentiment  \
0     623495513     True      golden                  10         3   
1     623495514     True      golden                  12         3   
2     623495515     True      golden                  10         3   
3     623495516     True      golden                  17         3   
4     623495517    False   finalized                   3         3   
...         ...      ...         ...                 ...       ...   
3881  623499442     True      golden                  13         3   
3882  623499450     True      golden                  16         3   
3883  623499486     True      golden                  14         5   
3884  623499514     True      golden                  13         1   
3885  623517290     True      golden                  17         5   

      sentiment:confidence                            date            id  \
0                   0.6264  Mon Dec 01 19:30:03 +00

In [54]:
print(df.dtypes)

_unit_id                  int64
_golden                    bool
_unit_state              object
_trusted_judgments        int64
sentiment                object
sentiment:confidence    float64
date                     object
id                      float64
query                    object
text                     object
dtype: object


In [55]:
print(df[df.duplicated()])

Empty DataFrame
Columns: [_unit_id, _golden, _unit_state, _trusted_judgments, sentiment, sentiment:confidence, date, id, query, text]
Index: []


In [56]:
print("Before cleaning:")
print(df['text'].head())
print("\
After cleaning:")
print(df_clean['text'].head())

Before cleaning:
0    #AAPL:The 10 best Steve Jobs emails ever...htt...
1    RT @JPDesloges: Why AAPL Stock Had a Mini-Flas...
2    My cat only chews @apple cords. Such an #Apple...
3    I agree with @jimcramer that the #IndividualIn...
4         Nobody expects the Spanish Inquisition #AAPL
Name: text, dtype: object
After cleaning:
0               :The 10 best Steve Jobs emails ever...
1    : Why AAPL Stock Had a Mini-Flash Crash Today ...
2                   My cat only chews cords. Such an .
3    I agree with that the should own not trade , i...
4               Nobody expects the Spanish Inquisition
Name: text, dtype: object
