In [None]:
import csv
import os
# Turn the raw training .txt file into a CSV that is easier to load later.

# The Data directory must exist before the output file can be created in it.
os.makedirs("Data", exist_ok=True)

# Where to read from and where to write to.
input_file = "Data/train.txt"
output_file = "Data/train.csv"

# Running tallies reported at the end of the cell.
valid_rows = 0
skipped_rows = 0

with open(input_file, 'r', encoding='latin-1') as infile, \
        open(output_file, 'w', newline='', encoding='utf-8') as outfile:
    writer = csv.writer(outfile, quoting=csv.QUOTE_MINIMAL)
    # Always emit our own column names rather than reusing the source header.
    writer.writerow(["TonyID", "Sentiment1", "SentimentText"])

    # Consume (and discard) the first line of the .txt file, treated as its header.
    header = next(infile, None)

    for line in map(str.strip, infile):
        # Try tabs first, keeping everything after the second tab as the text;
        # if that does not yield three fields, retry on runs of whitespace.
        parts = line.split('\t', 2)
        if len(parts) < 3:
            parts = line.split(None, 2)

        # Blank lines and rows without exactly three fields count as skipped.
        if len(parts) != 3:
            skipped_rows += 1
            continue

        tony_id, sentiment, text = (p.strip() for p in parts)

        # A row is only kept when every one of its fields is non-empty.
        if tony_id and sentiment and text:
            writer.writerow([tony_id, sentiment, text])
            valid_rows += 1
        else:
            skipped_rows += 1

print(f"Conversion complete! '{output_file}' created successfully.")
print(f"{valid_rows} valid rows written, {skipped_rows} rows skipped.")


In [None]:
import csv
import os
# Converting my .txt testing data to .csv for easier handling.
# Every data row is expected to carry two fields: an ID followed by the text.

# Ensure the Data folder exists
os.makedirs("Data", exist_ok=True)

# Input and output file paths
input_file = "Data/test2_public.txt"
output_file = "Data/test2.csv"

# Counters reported once the conversion has finished
valid_rows = 0
skipped_rows = 0

with open(input_file, 'r', encoding='latin-1') as infile, open(output_file, 'w', newline='', encoding='utf-8') as outfile:
    # Use csv.writer with quoting for safety
    writer = csv.writer(outfile, quoting=csv.QUOTE_MINIMAL)
    writer.writerow(["TonyID", "SentimentText"])  # enforce headers

    # Skip the header line from the .txt file
    header = next(infile, None)

    for line in infile:
        line = line.strip()
        if not line:
            skipped_rows += 1
            continue  # skip blank lines

        # Split off only the first column (the ID); keep the rest (the text) as one field
        parts = line.split('\t', 1)
        if len(parts) < 2:
            # Fallback for space-separated data. maxsplit must be 1 so the
            # text stays in a single field: with maxsplit=2 any text of more
            # than one word produced three parts and the row was wrongly
            # discarded by the length check below.
            parts = line.split(None, 1)

        if len(parts) != 2:
            skipped_rows += 1
            continue  # skip malformed rows (e.g. an ID with no text)

        tony_id, text = [p.strip() for p in parts]

        # Skip if any field is empty or null-like
        if not tony_id or not text:
            skipped_rows += 1
            continue

        writer.writerow([tony_id, text])
        valid_rows += 1

print(f"Conversion complete! '{output_file}' created successfully.")
print(f"{valid_rows} valid rows written, {skipped_rows} rows skipped.")


In [None]:
# Stack the rows of the two test CSV files into one combined test file.
import pandas as pd

# Load each input file into its own DataFrame.
df1, df2 = (pd.read_csv(csv_path) for csv_path in ('Data/test1.csv', 'Data/test2.csv'))

# Append df2's rows below df1's and renumber the index from zero.
combined_df = pd.concat([df1, df2], ignore_index=True)

# Write the result without the index column.
combined_df.to_csv('Data/test.csv', index=False)

# Short report: confirmation, size, column names and a preview.
print("Combined CSV created successfully!")
print(f"Total rows: {len(combined_df)}")
print(f"Columns: {list(combined_df.columns)}")
print("\nFirst few rows:")
print(combined_df.head())
