Task 2: Data Cleansing and Preprocessing

In [1]:
# Import required libraries
import os

import pandas as pd

# Load the dataset.
# The CSV location can be overridden with the SENTIMENT_CSV environment
# variable so the notebook also runs on machines other than the author's;
# when the variable is unset, the original local path is used unchanged.
file_path = os.environ.get(
    "SENTIMENT_CSV",
    r'C:\Users\JEAN JEASEN\Documents\Codveda\level_1\3) Sentiment dataset.csv',
)
df = pd.read_csv(file_path)

# Display original data (optional)
print("Original data shape:", df.shape)
df.head()


Original data shape: (732, 15)


Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,Text,Sentiment,Timestamp,User,Platform,Hashtags,Retweets,Likes,Country,Year,Month,Day,Hour
0,0,0,Enjoying a beautiful day at the park! ...,Positive,2023-01-15 12:30:00,User123,Twitter,#Nature #Park,15.0,30.0,USA,2023,1,15,12
1,1,1,Traffic was terrible this morning. ...,Negative,2023-01-15 08:45:00,CommuterX,Twitter,#Traffic #Morning,5.0,10.0,Canada,2023,1,15,8
2,2,2,Just finished an amazing workout! 💪 ...,Positive,2023-01-15 15:45:00,FitnessFan,Instagram,#Fitness #Workout,20.0,40.0,USA,2023,1,15,15
3,3,3,Excited about the upcoming weekend getaway! ...,Positive,2023-01-15 18:20:00,AdventureX,Facebook,#Travel #Adventure,8.0,15.0,UK,2023,1,15,18
4,4,4,Trying out a new recipe for dinner tonight. ...,Neutral,2023-01-15 19:55:00,ChefCook,Instagram,#Cooking #Food,12.0,25.0,Australia,2023,1,15,19


In [2]:
# One-time structural clean-up. The guard keeps this cell idempotent: without
# it, every re-run would silently discard one more record from df.
if 'No' not in df.columns:
    # Drop the first row (Row A)
    # NOTE(review): row 0 looks like a genuine record in the preview of the
    # loaded data - confirm that removing it is intended.
    df = df.drop(index=0).reset_index(drop=True)

    # Rename the first column to 'No'
    first_col_name = df.columns[0]
    df = df.rename(columns={first_col_name: 'No'})

# Reset 'No' column to start from 1
df['No'] = range(1, len(df) + 1)

# Drop 'Unnamed: 0' column if it exists (errors='ignore' makes this a no-op
# when the column is absent)
df = df.drop(columns=['Unnamed: 0'], errors='ignore')

# Display the cleaned DataFrame
print("After Cleaning: ")
df.head()


After Cleaning: 


Unnamed: 0,No,Text,Sentiment,Timestamp,User,Platform,Hashtags,Retweets,Likes,Country,Year,Month,Day,Hour
0,1,Traffic was terrible this morning. ...,Negative,2023-01-15 08:45:00,CommuterX,Twitter,#Traffic #Morning,5.0,10.0,Canada,2023,1,15,8
1,2,Just finished an amazing workout! 💪 ...,Positive,2023-01-15 15:45:00,FitnessFan,Instagram,#Fitness #Workout,20.0,40.0,USA,2023,1,15,15
2,3,Excited about the upcoming weekend getaway! ...,Positive,2023-01-15 18:20:00,AdventureX,Facebook,#Travel #Adventure,8.0,15.0,UK,2023,1,15,18
3,4,Trying out a new recipe for dinner tonight. ...,Neutral,2023-01-15 19:55:00,ChefCook,Instagram,#Cooking #Food,12.0,25.0,Australia,2023,1,15,19
4,5,Feeling grateful for the little things in lif...,Positive,2023-01-16 09:10:00,GratitudeNow,Twitter,#Gratitude #PositiveVibes,25.0,50.0,India,2023,1,16,9


In [3]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns
df.describe()

Unnamed: 0,No,Retweets,Likes,Year,Month,Day,Hour
count,731.0,731.0,731.0,731.0,731.0,731.0,731.0
mean,366.0,21.5171,42.919289,2020.467852,6.129959,15.497948,15.526676
std,211.165812,7.062009,14.091396,2.802639,3.408823,8.480336,4.114164
min,1.0,5.0,10.0,2010.0,1.0,1.0,0.0
25%,183.5,18.0,35.0,2019.0,3.0,9.0,13.0
50%,366.0,22.0,43.0,2021.0,6.0,15.0,16.0
75%,548.5,25.0,50.0,2023.0,9.0,22.0,19.0
max,731.0,40.0,80.0,2023.0,12.0,31.0,23.0


In [5]:
# Count the rows that are exact duplicates of an earlier row
duplicate_count = df.duplicated().sum()
print("Total Duplicated Data: ", duplicate_count)

Total Duplicated Data:  0


In [6]:
# Number of missing values in each column
df.isna().sum()

No           0
Text         0
Sentiment    0
Timestamp    0
User         0
Platform     0
Hashtags     0
Retweets     0
Likes        0
Country      0
Year         0
Month        0
Day          0
Hour         0
dtype: int64

LABEL ENCODING

In [10]:
# Inspect the platform labels before normalising them
print(df["Platform"].unique())

# Trim surrounding whitespace and standardise the casing of every label
df["Platform"] = df["Platform"].str.strip().str.capitalize()

# Manual label encoding. The codes follow alphabetical order so that they
# agree with the codes LabelEncoder assigns to Platform_LE later in the
# notebook (previously Twitter/Facebook were 0/2 here but 2/0 there).
mapping = {
    "Facebook": 0,
    "Instagram": 1,
    "Twitter": 2,
}

# Labels that are missing from the mapping become NaN in Platform_LE
df["Platform_LE"] = df["Platform"].map(mapping)


['Twitter' 'Instagram' 'Facebook']


In [11]:
# List the platform labels (with counts) whose encoded value is missing
unmapped_mask = df["Platform_LE"].isna()
unmapped_platforms = df.loc[unmapped_mask, "Platform"]
print(unmapped_platforms.value_counts())


Series([], Name: count, dtype: int64)


In [12]:
from sklearn.preprocessing import LabelEncoder

# Fit an encoder on the platform labels and store the integer codes
le = LabelEncoder()
df["Platform_LE"] = le.fit_transform(df["Platform"])

# Report which code each label received
label_codes = {
    label: code
    for label, code in zip(le.classes_, le.transform(le.classes_))
}
print(label_codes)

{'Facebook': 0, 'Instagram': 1, 'Twitter': 2}


In [13]:
# Preview the first rows, which now include the Platform_LE column
print("After Label Encoding: ")
df.head()

After Label Encoding: 


Unnamed: 0,No,Text,Sentiment,Timestamp,User,Platform,Hashtags,Retweets,Likes,Country,Year,Month,Day,Hour,Platform_LE
0,1,Traffic was terrible this morning. ...,Negative,2023-01-15 08:45:00,CommuterX,Twitter,#Traffic #Morning,5.0,10.0,Canada,2023,1,15,8,2
1,2,Just finished an amazing workout! 💪 ...,Positive,2023-01-15 15:45:00,FitnessFan,Instagram,#Fitness #Workout,20.0,40.0,USA,2023,1,15,15,1
2,3,Excited about the upcoming weekend getaway! ...,Positive,2023-01-15 18:20:00,AdventureX,Facebook,#Travel #Adventure,8.0,15.0,UK,2023,1,15,18,0
3,4,Trying out a new recipe for dinner tonight. ...,Neutral,2023-01-15 19:55:00,ChefCook,Instagram,#Cooking #Food,12.0,25.0,Australia,2023,1,15,19,1
4,5,Feeling grateful for the little things in lif...,Positive,2023-01-16 09:10:00,GratitudeNow,Twitter,#Gratitude #PositiveVibes,25.0,50.0,India,2023,1,16,9,2


NORMALIZE NUMERIC VARIABLES

In [14]:
from sklearn.preprocessing import MinMaxScaler

# Numeric columns that get rescaled
cols_to_normalize = ["Retweets", "Likes"]

# Min-max scaling with the scaler's default settings; the scaled values
# overwrite the original columns in df
scaler = MinMaxScaler()
scaled_values = scaler.fit_transform(df[cols_to_normalize])
df[cols_to_normalize] = scaled_values

# Optional: Print to verify
normalized_preview = df[cols_to_normalize].head()
print(normalized_preview)


   Retweets     Likes
0  0.000000  0.000000
1  0.428571  0.428571
2  0.085714  0.071429
3  0.200000  0.214286
4  0.571429  0.571429


In [15]:
# Write the cleaned dataset to a CSV file, leaving out the DataFrame index
output_csv = "cleaned_sentiment_dataset.csv"
df.to_csv(output_csv, index=False)