# Dataset preprocessing

## Determine the sentiment for the 1960s

In [None]:
import pandas as pd
import numpy as np

# Combine Marin's and Maud's labels into a single final sentiment
def determine_sentiment(row):
    """Return the final sentiment for a row labelled by Marin and Maud.

    Args:
        row: Mapping-like object (for example a DataFrame row) exposing the
            keys 'label_marin' and 'label_maud'.

    Returns:
        0, 1 or 2 when the pair of labels matches one of the rules below.
        None for every other pair, which includes the 0-versus-2
        disagreements, so that the caller can remove the row.
    """
    labels = (row['label_marin'], row['label_maud'])

    # ((label_marin, label_maud), final sentiment), checked in order.
    # Identical labels are kept as they are; a label of 1 yields to the
    # other annotator's 0 or 2.
    rules = (
        ((0, 0), 0),
        ((1, 1), 1),
        ((2, 2), 2),
        ((0, 1), 0),
        ((1, 0), 0),
        ((1, 2), 2),
        ((2, 1), 2),
    )
    for expected, sentiment in rules:
        if labels == expected:
            return sentiment

    # Reached for (0, 2), (2, 0) and any pair not listed above.
    return None  # Indicating that the row should be removed

# 1960s files, one per fuel; each carries both annotators' labels
input_files = [f'1960s_{fuel}_maud.csv' for fuel in ('coal', 'gas', 'oil')]

for input_file in input_files:
    # Keep the text plus the two label columns, named after their annotator
    df = (
        pd.read_csv(input_file)[['text', 'labels', 'labeler3']]
        .rename(columns={'labels': 'label_marin', 'labeler3': 'label_maud'})
    )

    # Row-wise combination of the two labels into the final sentiment
    df['sentiment'] = df.apply(determine_sentiment, axis=1)

    # Rows for which no sentiment could be determined are discarded
    df = df.dropna(subset=['sentiment'])

    # '<decade>_<fuel>_maud.csv' -> '<decade>_<fuel>.csv'
    name_parts = input_file.split('_')
    output_file = f"{name_parts[0]}_{name_parts[1]}.csv"

    # Write the result without the index column
    df.to_csv(output_file, index=False)

    print(f"The dataset {input_file} has been processed and saved as {output_file}")


The dataset 1960s_coal_maud.csv has been processed and saved as 1960s_coal.csv
The dataset 1960s_gas_maud.csv has been processed and saved as 1960s_gas.csv
The dataset 1960s_oil_maud.csv has been processed and saved as 1960s_oil.csv


## Filter datasets from Marin from the 1970s, 1980s and 1990s

In [None]:
import pandas as pd

# Marin's cleaned files: every decade combined with every fuel
input_files = [
    f'marin_{decade}_{fuel}_cleaned.csv'
    for decade in ('1970s', '1980s', '1990s')
    for fuel in ('coal', 'gas', 'oil')
]

# Reduce each file to the text and Marin's label
for input_file in input_files:
    # The result is written next to the input under a 'filtered_' prefix
    output_file = 'filtered_' + input_file

    df = pd.read_csv(input_file)

    # Keep 'text' and 'labels' only; 'labels' becomes 'label_marin'
    df_selected = df.loc[:, ['text', 'labels']].rename(columns={'labels': 'label_marin'})

    df_selected.to_csv(output_file, index=False)

    print(f"The dataset {input_file} has been filtered and saved as {output_file}")


The dataset marin_1970s_coal_cleaned.csv has been filtered and saved as filtered_marin_1970s_coal_cleaned.csv
The dataset marin_1970s_gas_cleaned.csv has been filtered and saved as filtered_marin_1970s_gas_cleaned.csv
The dataset marin_1970s_oil_cleaned.csv has been filtered and saved as filtered_marin_1970s_oil_cleaned.csv
The dataset marin_1980s_coal_cleaned.csv has been filtered and saved as filtered_marin_1980s_coal_cleaned.csv
The dataset marin_1980s_gas_cleaned.csv has been filtered and saved as filtered_marin_1980s_gas_cleaned.csv
The dataset marin_1980s_oil_cleaned.csv has been filtered and saved as filtered_marin_1980s_oil_cleaned.csv
The dataset marin_1990s_coal_cleaned.csv has been filtered and saved as filtered_marin_1990s_coal_cleaned.csv
The dataset marin_1990s_gas_cleaned.csv has been filtered and saved as filtered_marin_1990s_gas_cleaned.csv
The dataset marin_1990s_oil_cleaned.csv has been filtered and saved as filtered_marin_1990s_oil_cleaned.csv


## Filter datasets from Edo from the 1970s, 1980s and 1990s

In [None]:
import pandas as pd

# Edo's cleaned files: every decade combined with every fuel
input_files = [
    f'edo_{decade}_{fuel}_cleaned.csv'
    for decade in ('1970s', '1980s', '1990s')
    for fuel in ('coal', 'gas', 'oil')
]

# Reduce each file to the text and Edo's label
for input_file in input_files:
    # The result is written next to the input under a 'filtered_' prefix
    output_file = 'filtered_' + input_file

    df = pd.read_csv(input_file)

    # Keep 'text' and 'labels' only; 'labels' becomes 'label_edo'
    df_selected = df.loc[:, ['text', 'labels']].rename(columns={'labels': 'label_edo'})

    df_selected.to_csv(output_file, index=False)

    print(f"The dataset {input_file} has been filtered and saved as {output_file}")


The dataset edo_1970s_coal_cleaned.csv has been filtered and saved as filtered_edo_1970s_coal_cleaned.csv
The dataset edo_1970s_gas_cleaned.csv has been filtered and saved as filtered_edo_1970s_gas_cleaned.csv
The dataset edo_1970s_oil_cleaned.csv has been filtered and saved as filtered_edo_1970s_oil_cleaned.csv
The dataset edo_1980s_coal_cleaned.csv has been filtered and saved as filtered_edo_1980s_coal_cleaned.csv
The dataset edo_1980s_gas_cleaned.csv has been filtered and saved as filtered_edo_1980s_gas_cleaned.csv
The dataset edo_1980s_oil_cleaned.csv has been filtered and saved as filtered_edo_1980s_oil_cleaned.csv
The dataset edo_1990s_coal_cleaned.csv has been filtered and saved as filtered_edo_1990s_coal_cleaned.csv
The dataset edo_1990s_gas_cleaned.csv has been filtered and saved as filtered_edo_1990s_gas_cleaned.csv
The dataset edo_1990s_oil_cleaned.csv has been filtered and saved as filtered_edo_1990s_oil_cleaned.csv


## Filter datasets from Maud from the 1970s, 1980s and 1990s

In [None]:
import pandas as pd

# Maud's files: every decade combined with every fuel
input_files = [
    f'{decade}_{fuel}_maud.csv'
    for decade in ('1970s', '1980s', '1990s')
    for fuel in ('coal', 'gas', 'oil')
]

# Reduce each file to the text and Maud's label
for input_file in input_files:
    # The result is written next to the input under a 'filtered_' prefix
    output_file = 'filtered_' + input_file

    df = pd.read_csv(input_file)

    # Keep 'text_split' and 'labeler3' only, renamed to 'text' and
    # 'label_maud' respectively
    df_selected = df.loc[:, ['text_split', 'labeler3']].rename(
        columns={'text_split': 'text', 'labeler3': 'label_maud'}
    )

    df_selected.to_csv(output_file, index=False)

    print(f"The dataset {input_file} has been filtered, columns renamed, and saved as {output_file}")


The dataset 1970s_coal_maud.csv has been filtered, columns renamed, and saved as filtered_1970s_coal_maud.csv
The dataset 1970s_gas_maud.csv has been filtered, columns renamed, and saved as filtered_1970s_gas_maud.csv
The dataset 1970s_oil_maud.csv has been filtered, columns renamed, and saved as filtered_1970s_oil_maud.csv
The dataset 1980s_coal_maud.csv has been filtered, columns renamed, and saved as filtered_1980s_coal_maud.csv
The dataset 1980s_gas_maud.csv has been filtered, columns renamed, and saved as filtered_1980s_gas_maud.csv
The dataset 1980s_oil_maud.csv has been filtered, columns renamed, and saved as filtered_1980s_oil_maud.csv
The dataset 1990s_coal_maud.csv has been filtered, columns renamed, and saved as filtered_1990s_coal_maud.csv
The dataset 1990s_gas_maud.csv has been filtered, columns renamed, and saved as filtered_1990s_gas_maud.csv
The dataset 1990s_oil_maud.csv has been filtered, columns renamed, and saved as filtered_1990s_oil_maud.csv


## Merge the filtered datasets from Marin and Edo from the 1970s, 1980s and 1990s

In [None]:
import pandas as pd

# (Marin file, Edo file) pairs: every decade combined with every fuel
input_file_pairs = [
    (
        f'filtered_marin_{decade}_{fuel}_cleaned.csv',
        f'filtered_edo_{decade}_{fuel}_cleaned.csv',
    )
    for decade in ('1970s', '1980s', '1990s')
    for fuel in ('coal', 'gas', 'oil')
]

for part1, part2 in input_file_pairs:
    df1 = pd.read_csv(part1)
    df2 = pd.read_csv(part2)

    # Outer join on 'text': rows present in only one of the two files are
    # kept, with the other file's columns left empty
    merged_df = df1.merge(df2, on='text', how='outer')

    # Name the output after the 3rd and 4th underscore-separated pieces of
    # the first filename; fall back to the whole name when it is too short
    part1_split = part1.split('_')
    output_file = (
        f'merged_{part1_split[2]}_{part1_split[3]}'
        if len(part1_split) >= 4
        else f'merged_{part1}'
    )

    # The pieces used above carry no extension, so add it when missing
    if not output_file.endswith('.csv'):
        output_file += '.csv'

    merged_df.to_csv(output_file, index=False)

    print(f"The datasets {part1} and {part2} have been merged and saved as {output_file}")


The datasets filtered_marin_1970s_coal_cleaned.csv and filtered_edo_1970s_coal_cleaned.csv have been merged and saved as merged_1970s_coal.csv
The datasets filtered_marin_1970s_gas_cleaned.csv and filtered_edo_1970s_gas_cleaned.csv have been merged and saved as merged_1970s_gas.csv
The datasets filtered_marin_1970s_oil_cleaned.csv and filtered_edo_1970s_oil_cleaned.csv have been merged and saved as merged_1970s_oil.csv
The datasets filtered_marin_1980s_coal_cleaned.csv and filtered_edo_1980s_coal_cleaned.csv have been merged and saved as merged_1980s_coal.csv
The datasets filtered_marin_1980s_gas_cleaned.csv and filtered_edo_1980s_gas_cleaned.csv have been merged and saved as merged_1980s_gas.csv
The datasets filtered_marin_1980s_oil_cleaned.csv and filtered_edo_1980s_oil_cleaned.csv have been merged and saved as merged_1980s_oil.csv
The datasets filtered_marin_1990s_coal_cleaned.csv and filtered_edo_1990s_coal_cleaned.csv have been merged and saved as merged_1990s_coal.csv
The dataset

## Merge the filtered datasets from Marin, Edo and Maud from the 1970s, 1980s and 1990s

In [None]:
import pandas as pd

# Pairs of (Marin+Edo merge, Maud's filtered file), one per decade and fuel
input_file_pairs = [
    ('merged_1970s_coal.csv', 'filtered_1970s_coal_maud.csv'),
    ('merged_1970s_gas.csv', 'filtered_1970s_gas_maud.csv'),
    ('merged_1970s_oil.csv', 'filtered_1970s_oil_maud.csv'),
    ('merged_1980s_coal.csv', 'filtered_1980s_coal_maud.csv'),
    ('merged_1980s_gas.csv', 'filtered_1980s_gas_maud.csv'),
    ('merged_1980s_oil.csv', 'filtered_1980s_oil_maud.csv'),
    ('merged_1990s_coal.csv', 'filtered_1990s_coal_maud.csv'),
    ('merged_1990s_gas.csv', 'filtered_1990s_gas_maud.csv'),
    ('merged_1990s_oil.csv', 'filtered_1990s_oil_maud.csv')
]

# Process each pair of files
for part1, part2 in input_file_pairs:
    # Load the datasets
    df1 = pd.read_csv(part1)
    df2 = pd.read_csv(part2)

    # Outer join on 'text': rows present in only one of the two files are
    # kept, with the other file's columns left empty
    merged_df = pd.merge(df1, df2, on='text', how='outer')

    # Prefix the first filename with 'final_'. Every name listed above
    # splits into three underscore-separated pieces, so the former
    # "four or more pieces" branch never ran; this is the naming that was
    # actually produced ('final_merged_<decade>_<fuel>.csv'), and the name
    # already ends in '.csv'.
    output_file = f'final_{part1}'

    # Save the resulting DataFrame to a new CSV file
    merged_df.to_csv(output_file, index=False)

    print(f"The datasets {part1} and {part2} have been merged and saved as {output_file}")


The datasets merged_1970s_coal.csv and filtered_1970s_coal_maud.csv have been merged and saved as final_merged_1970s_coal.csv
The datasets merged_1970s_gas.csv and filtered_1970s_gas_maud.csv have been merged and saved as final_merged_1970s_gas.csv
The datasets merged_1970s_oil.csv and filtered_1970s_oil_maud.csv have been merged and saved as final_merged_1970s_oil.csv
The datasets merged_1980s_coal.csv and filtered_1980s_coal_maud.csv have been merged and saved as final_merged_1980s_coal.csv
The datasets merged_1980s_gas.csv and filtered_1980s_gas_maud.csv have been merged and saved as final_merged_1980s_gas.csv
The datasets merged_1980s_oil.csv and filtered_1980s_oil_maud.csv have been merged and saved as final_merged_1980s_oil.csv
The datasets merged_1990s_coal.csv and filtered_1990s_coal_maud.csv have been merged and saved as final_merged_1990s_coal.csv
The datasets merged_1990s_gas.csv and filtered_1990s_gas_maud.csv have been merged and saved as final_merged_1990s_gas.csv
The dat

## Determining sentiment for the 1970s, 1980s and 1990s

In [None]:
import pandas as pd

# List of input file names
input_files = [
    'final_merged_1970s_coal.csv', 'final_merged_1970s_gas.csv',
    'final_merged_1970s_oil.csv', 'final_merged_1980s_coal.csv',
    'final_merged_1980s_gas.csv', 'final_merged_1980s_oil.csv',
    'final_merged_1990s_coal.csv', 'final_merged_1990s_gas.csv',
    'final_merged_1990s_oil.csv'
]


# Defined once, ahead of the loop, because it does not depend on the file
# being processed.
# NOTE: this redefines `determine_sentiment` from the 1960s cell, which
# combines Marin's and Maud's labels only.
def determine_sentiment(row):
    """Return the final sentiment for a row labelled by Marin, Edo and Maud.

    Args:
        row: Mapping-like object (for example a DataFrame row) exposing the
            keys 'label_marin', 'label_edo' and 'label_maud'.

    Returns:
        0, 1 or 2 when Marin's and Edo's labels match one of the rules
        below; Maud's label (whatever value the row holds, possibly
        missing) when Marin and Edo disagree 0 versus 2; None when either
        of Marin's or Edo's labels is missing or the pair is not covered.
    """
    label_marin = row['label_marin']
    label_edo = row['label_edo']
    label_maud = row['label_maud']

    # Without both primary labels no rule applies
    if pd.isna(label_marin) or pd.isna(label_edo):
        return None

    pair = (label_marin, label_edo)

    # Identical labels are kept; a label of 1 yields to the other 0 or 2
    if pair in ((0, 0), (0, 1), (1, 0)):
        return 0
    if pair == (1, 1):
        return 1
    if pair in ((2, 2), (1, 2), (2, 1)):
        return 2

    # 0 versus 2: Maud's label decides
    if pair in ((0, 2), (2, 0)):
        return label_maud

    return None


# Process each dataset
for input_file in input_files:
    # Read the dataset
    df = pd.read_csv(input_file)

    # Drop rows where either 'label_marin' or 'label_edo' is missing
    df = df.dropna(subset=['label_marin', 'label_edo'])

    # Apply the function to each row to determine final sentiment
    df['sentiment'] = df.apply(determine_sentiment, axis=1)

    # Remove rows where final sentiment could not be determined; this
    # includes 0-versus-2 rows for which Maud's label is missing
    df = df.dropna(subset=['sentiment'])

    # 'final_merged_<decade>_<fuel>.csv' -> '<decade>_<fuel>.csv'
    name_parts = input_file.split('_')
    output_file = f"{name_parts[2]}_{name_parts[3]}"

    # Save the filtered DataFrame to a new CSV file
    df.to_csv(output_file, index=False)

    print(f"The dataset {input_file} has been filtered and saved as {output_file}")


The dataset final_merged_1970s_coal.csv has been filtered and saved as 1970s_coal.csv
The dataset final_merged_1970s_gas.csv has been filtered and saved as 1970s_gas.csv
The dataset final_merged_1970s_oil.csv has been filtered and saved as 1970s_oil.csv
The dataset final_merged_1980s_coal.csv has been filtered and saved as 1980s_coal.csv
The dataset final_merged_1980s_gas.csv has been filtered and saved as 1980s_gas.csv
The dataset final_merged_1980s_oil.csv has been filtered and saved as 1980s_oil.csv
The dataset final_merged_1990s_coal.csv has been filtered and saved as 1990s_coal.csv
The dataset final_merged_1990s_gas.csv has been filtered and saved as 1990s_gas.csv
The dataset final_merged_1990s_oil.csv has been filtered and saved as 1990s_oil.csv


## Remove duplicates from all datasets

In [None]:
import pandas as pd

# Files to deduplicate: the gas dataset of each decade.
# NOTE(review): earlier cells also write coal and oil files with the same
# naming, but only the gas files are listed here. Confirm this is intended.
filenames = [f'{decade}_gas.csv' for decade in ('1960s', '1970s', '1980s', '1990s')]

# Deduplicate on the ('text', 'sentiment') pair
def remove_duplicate_rows(df):
    """Return `df` without rows that repeat an earlier ('text', 'sentiment') pair.

    The first occurrence of each pair is kept. Other columns are ignored
    when deciding whether two rows are duplicates. The input frame is not
    modified.
    """
    key_columns = ['text', 'sentiment']
    deduplicated = df.drop_duplicates(subset=key_columns, keep='first')
    return deduplicated

# Deduplicate every listed file and write it back under a 'cleaned_' prefix
for filename in filenames:
    cleaned_filename = f'cleaned_{filename}'

    # Load, then drop rows repeating an earlier ('text', 'sentiment') pair
    df = pd.read_csv(filename)
    cleaned_df = remove_duplicate_rows(df)

    # The original file is left untouched; the result goes to a new file
    cleaned_df.to_csv(cleaned_filename, index=False)

    print(f"Processed {filename}, saved cleaned data to {cleaned_filename}")



Processed 1960s_gas.csv, saved cleaned data to cleaned_1960s_gas.csv
Processed 1970s_gas.csv, saved cleaned data to cleaned_1970s_gas.csv
Processed 1980s_gas.csv, saved cleaned data to cleaned_1980s_gas.csv
Processed 1990s_gas.csv, saved cleaned data to cleaned_1990s_gas.csv
