In [None]:
# 📦 Install contractions library (only needs to be done once per session)
!pip install contractions

import pandas as pd
import numpy as np
import re
import string
import contractions

# ✅ Step 1: Load the dataset from Colab file system
# The CSV has no header row (header=None), so the column names are assigned
# explicitly right after loading.
df = pd.read_csv(
    "/content/training_data.csv",
    encoding='latin-1',
    header=None,
    on_bad_lines='skip',  # drop malformed rows instead of aborting the load
    engine='python'  # 👈 more robust parsing
)

df.columns = ['target', 'ids', 'date', 'flag', 'user', 'text']

# ✅ Step 2: Keep only 0 (negative) and 4 (positive) labels, convert 4 → 1
# .copy() detaches the filtered rows from the frame read from disk, so the
# column assignment below works on an independent frame and does not raise
# SettingWithCopyWarning on pandas versions without copy-on-write.
df = df[df['target'].isin([0, 4])].copy()
df['target'] = df['target'].replace({4: 1})

# ✅ Step 3: Define characters to KEEP (keep ! and ? for emotion)
KEEP_CHARS = "!?"

# Deletion table built once from KEEP_CHARS instead of on every call: the
# function is applied to each row, and the table never changes between rows.
# Re-run this cell after editing KEEP_CHARS so the table is rebuilt.
_PUNCTUATION_TO_DROP = "".join(
    ch for ch in string.punctuation if ch not in KEEP_CHARS
)
_DROP_TABLE = str.maketrans('', '', _PUNCTUATION_TO_DROP)


# Function to remove punctuation except for ! and ?
def remove_unnecessary_punctuation(text):
    """Return *text* with ASCII punctuation removed, except KEEP_CHARS.

    Args:
        text: The string to clean.

    Returns:
        A copy of *text* in which every character of ``string.punctuation``
        that is not listed in ``KEEP_CHARS`` has been deleted. All other
        characters (letters, digits, whitespace, non-ASCII symbols) are
        left untouched.
    """
    return text.translate(_DROP_TABLE)

# ✅ Step 4: Preprocess tweets
# Patterns are compiled once because preprocess_text runs once per row.
_MENTION_RE = re.compile(r'@\w+')
# The dot after "www" is escaped and the match is anchored on a word boundary;
# with a bare "www." the dot matches any character, so elongated words such
# as "awwwww" were treated as URLs and deleted.
_URL_RE = re.compile(r'http\S+|\bwww\.\S+')
_WHITESPACE_RE = re.compile(r'\s+')


def preprocess_text(text):
    """Normalize one tweet for downstream use.

    Steps, in order: lowercase, expand contractions with
    ``contractions.fix``, remove ``@mentions``, remove URLs (anything
    starting with ``http`` or ``www.``), remove double quotes, remove the
    punctuation handled by ``remove_unnecessary_punctuation``, and collapse
    runs of whitespace into single spaces.

    Args:
        text: The raw tweet as a ``str``.

    Returns:
        The cleaned string with no leading or trailing whitespace.
    """
    text = text.lower()
    text = contractions.fix(text)  # expand contractions
    text = _MENTION_RE.sub('', text)  # remove mentions
    text = _URL_RE.sub('', text)  # remove URLs
    text = text.replace('"', '')  # remove double quotes
    text = remove_unnecessary_punctuation(text)
    # Removing tokens above can leave gaps; squeeze them last.
    return _WHITESPACE_RE.sub(' ', text).strip()

# Apply preprocessing to text
# Cast every cell to str before cleaning so preprocess_text always receives
# a string, then write the cleaned values back into the same column.
cleaned_text = df['text'].astype(str).map(preprocess_text)
df['text'] = cleaned_text

# ✅ Step 5: Retain only necessary columns
columns_to_keep = ['text', 'target']
df = df[columns_to_keep]

# ✅ Step 6: Shuffle and split into 3 clients and 1 test set
# Shuffle with a fixed seed so the four splits are reproducible.
df = df.sample(frac=1, random_state=42).reset_index(drop=True)

# Split row positions rather than the DataFrame itself: np.array_split on a
# DataFrame goes through the deprecated DataFrame.swapaxes and emits a
# FutureWarning on recent pandas. The chunk sizes are the same as before
# (as equal as possible, earlier chunks get the extra rows).
row_chunks = np.array_split(np.arange(len(df)), 4)
split_dfs = [df.iloc[rows] for rows in row_chunks]

# ✅ Step 7: Save the splits
# The first three chunks go to the clients, the last one is the test set.
output_names = [
    "client_1_data.csv",
    "client_2_data.csv",
    "client_3_data.csv",
    "test_data.csv",
]
for part, name in zip(split_dfs, output_names):
    part.to_csv(f"/content/{name}", index=False)

print("✅ Files saved: " + ", ".join(output_names))


In [4]:
from google.colab import files
# Pass each generated CSV to Colab's files.download, in the order the files
# were written.
for csv_path in (
    "/content/client_1_data.csv",
    "/content/client_2_data.csv",
    "/content/client_3_data.csv",
    "/content/test_data.csv",
):
    files.download(csv_path)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>