In [1]:
import pandas as pd
import os

def csv_to_text_files(csv_file, output_dir, prefix='neutral', header='infer'):
    """Write each non-numeric row of a CSV file to its own text file.

    The CSV is loaded with pandas (lines pandas flags as bad are skipped).
    For every row, the non-missing cell values are written one per line to
    ``<output_dir>/<prefix>_<n>.txt``, where ``n`` is the row's index label
    in the loaded DataFrame plus one. Rows that have no values at all, or
    whose values are all numeric, are skipped, so the numbering of the
    created files can contain gaps.

    Args:
        csv_file: Path of the CSV file to read.
        output_dir: Directory that receives the text files; it is created
            (including parents) when it does not exist yet.
        prefix: File-name prefix of the generated files.
        header: Forwarded to ``pandas.read_csv``. With the default
            ``'infer'`` the first line of the file is consumed as column
            names and is therefore never written out; pass ``None`` for a
            file that has no header line.
    """
    # exist_ok=True avoids the race of a separate exists() check
    os.makedirs(output_dir, exist_ok=True)

    # Read the CSV file into a pandas DataFrame
    df = pd.read_csv(csv_file, on_bad_lines='skip', header=header)

    for index, row in df.iterrows():
        # Drop missing cells so they are neither written out as the literal
        # string "nan" nor counted as "not a number" in the check below
        values = row.dropna()

        if values.empty:
            print(f"Skipping row {index + 1} because it's empty.")
            continue

        if pd.to_numeric(values, errors='coerce').notna().all():
            # Every remaining cell parsed as a number
            print(f"Skipping row {index + 1} because it's all numbers.")
            continue

        # One text file per kept row
        text_file_path = os.path.join(output_dir, f"{prefix}_{index + 1}.txt")

        # Write the row's values, one per line
        with open(text_file_path, 'w', encoding='utf-8') as text_file:
            text_file.write('\n'.join(values.astype(str)))

        print(f"Created: {text_file_path}")

# Example usage: turn the rows of one CSV file into individual text files
csv_file_path = 'all_csv/neutral_v2.csv'  # CSV file to read
output_directory = 'data_v1/neutral'  # Folder that receives the generated text files

csv_to_text_files(csv_file=csv_file_path, output_dir=output_directory)

Created: data_v1/neutral/neutral_1.txt
Created: data_v1/neutral/neutral_2.txt
Skipping row 3 because it's all numbers.
Created: data_v1/neutral/neutral_4.txt
Created: data_v1/neutral/neutral_5.txt
Skipping row 6 because it's all numbers.
Created: data_v1/neutral/neutral_7.txt
Created: data_v1/neutral/neutral_8.txt
Skipping row 9 because it's all numbers.
Created: data_v1/neutral/neutral_10.txt
Created: data_v1/neutral/neutral_11.txt
Skipping row 12 because it's all numbers.
Created: data_v1/neutral/neutral_13.txt
Created: data_v1/neutral/neutral_14.txt
Skipping row 15 because it's all numbers.
Created: data_v1/neutral/neutral_16.txt
Created: data_v1/neutral/neutral_17.txt
Skipping row 18 because it's all numbers.
Created: data_v1/neutral/neutral_19.txt
Created: data_v1/neutral/neutral_20.txt
Skipping row 21 because it's all numbers.
Created: data_v1/neutral/neutral_22.txt
Created: data_v1/neutral/neutral_23.txt
Skipping row 24 because it's all numbers.
Created: data_v1/neutral/neutral_

In [1]:
import csv

# Input and output file paths
input_file = 'all_csv/training.1600000.processed.noemoticon.csv'
output_file = 'all_csv/neutral_v2.csv'

# Open the input CSV file. newline='' is what the csv module asks for, so that
# line breaks inside quoted fields are handed to the parser untouched.
# NOTE(review): no encoding is given, so the platform default is used for both
# reading and writing - confirm that this matches the source file.
with open(input_file, mode='r', newline='') as infile:
    reader = csv.reader(infile)

    # Keep the last field of every row. A blank line is parsed as an empty
    # list, so it is skipped here instead of raising IndexError on row[-1].
    last_column_data = [row[-1] for row in reader if row]

# Write the extracted column to the output CSV file, one value per row
with open(output_file, mode='w', newline='') as outfile:
    writer = csv.writer(outfile)
    writer.writerows([item] for item in last_column_data)

print(f"The last column has been successfully written to {output_file}")


The last column has been successfully written to all_csv/neutral_v2.csv


In [3]:
import re

# Source and destination of the quote-stripping pass
input_file = 'all_csv/neutral_v2.csv'
output_file = 'all_csv/neutral_v3.csv'

# Matches any single or double quotation mark
quote_pattern = re.compile(r'["\']')

# Load the whole file into one string
with open(input_file, 'r') as source:
    content = source.read()

# Delete every quotation mark found in the text
cleaned_content = quote_pattern.sub('', content)

# Store the result in the destination file
with open(output_file, 'w') as target:
    target.write(cleaned_content)

print(f"Quotation marks have been removed and saved to {output_file}")

Quotation marks have been removed and saved to all_csv/neutral_v3.csv


In [4]:
import re

# Source and destination of the @-word removal pass
input_file = 'all_csv/neutral_v3.csv'
output_file = 'all_csv/neutral_v4.csv'

# Matches an "@" together with the run of word characters that follows it
at_word_pattern = re.compile(r'@\w+')

# Load the whole file into one string
with open(input_file, 'r') as source:
    content = source.read()

# Delete every match; the surrounding whitespace is left in place
cleaned_content = at_word_pattern.sub('', content)

# Store the result in the destination file
with open(output_file, 'w') as target:
    target.write(cleaned_content)

print(f"Words starting with @ have been removed and saved to {output_file}")

Words starting with @ have been removed and saved to all_csv/neutral_v4.csv


In [5]:
input_file = 'all_csv/neutral_v4.csv'
output_file = 'all_csv/neutral_v5.csv'

# Copy the file line by line, dropping the whitespace at the start of each
# line. lstrip() reduces a whitespace-only line to an empty string, so such
# lines do not appear in the output at all.
with open(input_file, 'r') as infile:
    with open(output_file, 'w') as outfile:
        outfile.writelines(line.lstrip() for line in infile)

print(f"Leading spaces have been removed from each line and saved to {output_file}")

Leading spaces have been removed from each line and saved to all_csv/neutral_v5.csv


In [None]:
import os

# Source file and destination folder
input_file = 'all_csv/neutral_v5.csv'
output_dir = 'datasets/data_v1/neutral'

# Make sure the destination folder is there (no error if it already exists)
os.makedirs(output_dir, exist_ok=True)

# Walk through the input one line at a time, numbering the lines from 1
with open(input_file, 'r') as infile:
    for line_number, line in enumerate(infile, start=1):
        # Every line gets its own file named neutral_<line number>.txt
        output_file = os.path.join(output_dir, f'neutral_{line_number}.txt')

        # Store the line without its surrounding whitespace (newline included)
        with open(output_file, 'w') as outfile:
            outfile.write(line.strip())

print(f"Each line has been saved as a separate text file in the directory: {output_dir}")


In [1]:
import csv

# File paths
input_file = 'all_csv/neutral_v5.csv'  # Replace with the path to your input CSV file
output_file = 'all_csv/neutral_v6.csv'  # Replace with the desired path for the output file

# Number of leading rows to copy. Every parsed row counts, so a header row
# (if the file has one) is included in this total.
MAX_ROWS = 400

# Both files are opened with newline='' as the csv module documentation asks
with open(input_file, 'r', newline='') as infile, open(output_file, 'w', newline='') as outfile:
    reader = csv.reader(infile)
    writer = csv.writer(outfile)

    # Copy rows until MAX_ROWS have been written or the input is exhausted
    for i, row in enumerate(reader):
        if i >= MAX_ROWS:
            break
        writer.writerow(row)

print(f"First {MAX_ROWS} lines have been written to {output_file}")

First 400 lines have been written to all_csv/neutral_v6.csv
