In [1]:
import pandas as pd

# Load raw CSV (with inconsistent commas).
# dtype=str keeps every cell exactly as it is written in the file.  Without
# it, a column that holds only labels and blanks is parsed as float64
# (1 -> 1.0), and the string comparison in extract_label no longer
# recognises those cells as labels.
df_raw = pd.read_csv("amazon_cells_labelled.csv", header=None, dtype=str)

# Function to extract label from any of the extra columns
def extract_label(row):
    """Return the sentiment label (0 or 1) found after the first cell of *row*.

    Cells are scanned left to right starting at position 1, and the first
    cell that looks like a label wins.  A cell counts as a label when its
    stripped string form is ``0`` or ``1``, or the float spelling ``0.0`` or
    ``1.0``.  The float spelling is what a numeric label becomes when the
    column it sits in was parsed as floating point.

    Args:
        row: One parsed row (a pandas Series or any sliceable sequence)
            whose first element is the review text.

    Returns:
        The label as an ``int``, or ``None`` when no cell qualifies.
    """
    label_tokens = {'0': 0, '1': 1, '0.0': 0, '1.0': 1}
    for val in row[1:]:
        label = label_tokens.get(str(val).strip())
        if label is not None:
            return label
    return None

# Step 1: Extract review text and label
def _split_text_and_label(cells):
    """Split one raw row into ``(text, label)``.

    A review that itself contains commas is spread over several columns, so
    the text is rebuilt by joining every non-missing cell that precedes the
    label cell with ``,``.  Taking only the first column would cut such
    reviews off at their first comma.

    Args:
        cells: The values of one row of ``df_raw``, in column order.

    Returns:
        A ``(text, label)`` tuple.  ``label`` is ``None`` when the row has
        no label cell; ``text`` is ``None`` when every cell before the
        label is missing.
    """
    for pos in range(1, len(cells)):
        # extract_label() only inspects elements after the first one, so a
        # two-element slice asks it about cells[pos] alone.  Reusing it
        # keeps a single definition of what counts as a label.
        label = extract_label(cells[pos - 1:pos + 1])
        if label is not None:
            pieces = [str(cell) for cell in cells[:pos] if not pd.isna(cell)]
            return (','.join(pieces) if pieces else None), label
    # No label found: keep the first cell as text; the row is dropped below.
    return (cells[0] if cells else None), None


parsed_rows = [
    _split_text_and_label(cells)
    for cells in df_raw.itertuples(index=False, name=None)
]
df_clean = pd.DataFrame(parsed_rows, columns=['text', 'label'])

# Step 2: Drop rows with missing text or labels
df_clean = df_clean.dropna().reset_index(drop=True)
df_clean['label'] = df_clean['label'].astype(int)

# Step 3 (Optional): Check basic stats
print("Sample data:")
print(df_clean.head())
print("\nLabel distribution:")
print(df_clean['label'].value_counts())

# Step 4: Save cleaned dataset
df_clean.to_csv("cleaned_amazon_sentiment.csv", index=False)

print("\n✅ Cleaned dataset saved as 'cleaned_amazon_sentiment.csv'")


Sample data:
                                                text  label
0  So there is no way for me to plug it in here i...      0
1                                          Good case      1
2                             Great for the jawbone.      1
3  Tied to charger for conversations lasting more...      0
4                                  The mic is great.      1

Label distribution:
label
0    499
1    497
Name: count, dtype: int64

✅ Cleaned dataset saved as 'cleaned_amazon_sentiment.csv'
