In [1]:
# Import libraries
import pandas as pd
import zipfile
import os

# ========== STEP 1: Extract Dataset ==========
# Where the downloaded archive lives and where it gets unpacked
# (adjust both paths to match your machine).
zip_path = r"C:\Users\DELL\Downloads\archive (1).zip"
extract_path = r"C:\Users\DELL\Downloads\sentiment_data"

# Unpack every member of the archive into extract_path.
with zipfile.ZipFile(zip_path, mode='r') as archive:
    archive.extractall(path=extract_path)

# Read the extracted CSV. header=None means the first row of the file is
# treated as data rather than as column labels, so the labels are assigned
# afterwards (names as per the dataset description on Kaggle).
file_path = os.path.join(extract_path, 'training.1600000.processed.noemoticon.csv')
df = pd.read_csv(file_path, header=None, encoding='latin-1')
df.columns = ["target", "ids", "date", "flag", "user", "text"]

# ========== STEP 2: BEFORE CLEANING INFO ==========
# Summary of the raw frame, collected key by key:
#   "Shape"          -> (rows, cols) tuple
#   "Missing Values" -> {column name: null count}
#   "Duplicates"     -> number of rows flagged by DataFrame.duplicated()
df_info_before = {}
df_info_before["Shape"] = df.shape
df_info_before["Missing Values"] = df.isnull().sum().to_dict()
df_info_before["Duplicates"] = df.duplicated().sum()

print("===== BEFORE CLEANING =====")
# Dicts keep insertion order, so the lines print as Shape, Missing Values, Duplicates.
for label, value in df_info_before.items():
    print(f"{label}:", value)


# ========== STEP 3: DATA CLEANING ==========

# The three cleaning passes run in order as a single chain:
#   1. drop_duplicates() - remove duplicated rows
#   2. dropna()          - remove rows that contain missing values
#   3. length filter     - treat outliers in the 'text' column by keeping only
#                          tweets with length between 3 and 280 characters
# The callable passed to .loc receives the frame produced by the previous
# step, so the length mask is computed on the already de-duplicated,
# null-free rows.
df_cleaned = (
    df.drop_duplicates()
    .dropna()
    .loc[lambda frame: frame['text'].str.len().between(3, 280)]
)


# ========== STEP 4: AFTER CLEANING INFO ==========
# Same summary as collected before cleaning, now taken from df_cleaned:
#   "Shape"          -> (rows, cols) tuple
#   "Missing Values" -> {column name: null count}
#   "Duplicates"     -> number of rows flagged by DataFrame.duplicated()
df_info_after = {}
df_info_after["Shape"] = df_cleaned.shape
df_info_after["Missing Values"] = df_cleaned.isnull().sum().to_dict()
df_info_after["Duplicates"] = df_cleaned.duplicated().sum()

print("\n===== AFTER CLEANING =====")
# Dicts keep insertion order, so the lines print as Shape, Missing Values, Duplicates.
for label, value in df_info_after.items():
    print(f"{label}:", value)


# ========== STEP 5: BEFORE vs AFTER REPORT ==========
# One row per aspect, one column per cleaning stage. Both stage columns are
# derived the same way from their summary dict: the shape tuple and the
# missing-values dict are stringified so each fits in a single cell, while
# the duplicate count is stored as-is.
report_data = {
    "Aspect": ["Shape (rows, cols)", "Missing Values", "Duplicates"],
    **{
        stage: [
            str(summary["Shape"]),
            str(summary["Missing Values"]),
            summary["Duplicates"],
        ]
        for stage, summary in (
            ("Before Cleaning", df_info_before),
            ("After Cleaning", df_info_after),
        )
    },
}

report_df = pd.DataFrame(report_data)
print("\n===== BEFORE vs AFTER CLEANING REPORT =====")
print(report_df.to_string(index=False))

# Optionally save the cleaned dataset (index=False leaves the row index out of the file)
df_cleaned.to_csv(
    r"C:\Users\DELL\Downloads\twitter_sentiment_cleaned.csv",
    index=False,
)


===== BEFORE CLEANING =====
Shape: (1600000, 6)
Missing Values: {'target': 0, 'ids': 0, 'date': 0, 'flag': 0, 'user': 0, 'text': 0}
Duplicates: 0

===== AFTER CLEANING =====
Shape: (1599982, 6)
Missing Values: {'target': 0, 'ids': 0, 'date': 0, 'flag': 0, 'user': 0, 'text': 0}
Duplicates: 0

===== BEFORE vs AFTER CLEANING REPORT =====
            Aspect                                                     Before Cleaning                                                      After Cleaning
Shape (rows, cols)                                                        (1600000, 6)                                                        (1599982, 6)
    Missing Values {'target': 0, 'ids': 0, 'date': 0, 'flag': 0, 'user': 0, 'text': 0} {'target': 0, 'ids': 0, 'date': 0, 'flag': 0, 'user': 0, 'text': 0}
        Duplicates                                                                   0                                                                   0
