In [2]:
# imports
import pandas as pd
import re
from math import ceil
import spacy

In [3]:
def clean_text(text):
    # Extended pattern to remove URLs, including non-standard patterns like 'pictwittercom / Wav1bacr5j'
    text_no_urls = re.sub(r'\b(?:http\S+|www\.\S+|[a-zA-Z0-9]+(?:[.\/]\S+)+)\b', '', text)
    
    # Remove spaces around slashes which are not typical in URLs but used in your examples
    text_no_extra_spaces = re.sub(r'\s+\/\s+', '/', text_no_urls)
    
    # Remove Punctuation except for @
    text_no_punctuation = re.sub(r'[.,:;!?"%$()[{@\'`\-]', '', text_no_extra_spaces)
    
    # Normalize Spaces
    text_normalized_spaces = re.sub(r'\s+', ' ', text_no_punctuation)
    
    # Trim spaces
    text_trimmed = text_normalized_spaces.strip()
    return text_trimmed

In [12]:
nlp = spacy.load("en_core_web_sm")

def clean_text_spacy(line):
    doc = nlp(line)
    filtered_tokens = [token.text for token in doc if token.is_alpha and not token.text.startswith('@')]
    return ' '.join(filtered_tokens)

In [21]:
# sentences_50-allagree

df = pd.read_csv('./data/train/Sentences_50Agree.txt', sep='\t', encoding='ISO-8859-1', header=None, names=['blaa'])
df_split = df['blaa'].str.split('.@', expand=True)
df_split.columns = ['text','sentiment']
df_split['text'] = df_split['text'].str.strip()
df_split['sentiment'] = df_split['sentiment'].str.strip()

formatted_and_cleaned_lines = df_split.apply(lambda row: clean_text_spacy(row[0]) + " @" + row[1].lower(), axis=1)

with open('./data/train/clean_train_texts/Sentences_50Agree_spacy.txt', 'w', encoding='utf-8') as file:
    for line in formatted_and_cleaned_lines:
        if len(line) > 10:  # Only write lines that are longer than the minimum length
            file.write(line + '\n')

  formatted_and_cleaned_lines = df_split.apply(lambda row: clean_text_spacy(row[0]) + " @" + row[1].lower(), axis=1)


In [15]:
# twitter_training.csv cleaning

# Load the DataFrame
column_names = ['0', '1', '2', '3']
df = pd.read_csv('./data/train/twitter_training.csv', header=None, names=column_names, encoding='ISO-8859-1')
df.drop(['0', '1'], axis=1, inplace=True)

# Filter out 'Irrelevant' values
df_no_irrelevant_values = df[df['2'] != 'Irrelevant']
df_no_irrelevant_values = df_no_irrelevant_values.dropna(subset=['2', '3'])  # Adjust column names as necessary
df_no_irrelevant_values = df_no_irrelevant_values.astype({'2': 'str', '3': 'str'})
# Apply transformations, cleaning, and check length in one step
formatted_and_cleaned_lines = df_no_irrelevant_values.apply(lambda row: clean_text_spacy(row[1]) + " @" + row[0].lower(), axis=1)

# Define a minimum length for text to be included
min_length = 20  # Example minimum length

# Write the cleaned, formatted, and length-checked lines to a file
with open('./data/train/clean_train_texts/twitter_training_clean_spacy.txt', 'w', encoding='utf-8') as file:
    for line in formatted_and_cleaned_lines:
        if len(line) > min_length:  # Only write lines that are longer than the minimum length
            file.write(line + '\n')

  formatted_and_cleaned_lines = df_no_irrelevant_values.apply(lambda row: clean_text_spacy(row[1]) + " @" + row[0].lower(), axis=1)


In [19]:
# train.csv cleaning

df = pd.read_csv('./data/train/train.csv', encoding='ISO-8859-1')
column_to_keep = ['selected_text','sentiment']
df = df[column_to_keep]
print(df.head())

value_counts = df['sentiment'].value_counts()
print(value_counts)
df = df.dropna(subset=['selected_text','sentiment'])
df = df.astype({'selected_text':'str','sentiment':'str'})
formatted_and_cleaned_lines = df.apply(lambda row: clean_text_spacy(row[0]) + " @" + row[1].lower(), axis=1)

with open('./data/train/clean_train_texts/twitter_training_2_clean_spacy.txt', 'w', encoding='utf-8') as file:
    for line in formatted_and_cleaned_lines:
        if len(line) > 10:  # Only write lines that are longer than the minimum length
            file.write(line + '\n')


                         selected_text sentiment
0  I`d have responded, if I were going   neutral
1                             Sooo SAD  negative
2                          bullying me  negative
3                       leave me alone  negative
4                        Sons of ****,  negative
sentiment
neutral     11118
positive     8582
negative     7781
Name: count, dtype: int64


  formatted_and_cleaned_lines = df.apply(lambda row: clean_text_spacy(row[0]) + " @" + row[1].lower(), axis=1)


In [20]:
# airline-sentiment-2-w-AA.csv cleaning

df = pd.read_csv('./data/train/Airline-sentiment-2-w-AA.csv', encoding='ISO-8859-1')
columns_to_keep = ['airline_sentiment','text']
df = df[columns_to_keep]
print(df.head())

df['text'] = df['text'].str.replace('@','', regex=False) \
                        .str.replace('VirginAmerica','', regex=False) \
                        .str.replace('united','', regex=False) \
                        .str.replace('SouthwestAir','',regex=False) \
                        .str.replace('JetBlue','', regex=False) \
                        .str.replace('USAirways','',regex=False) \
                        .str.replace('AmericanAir','',regex=False)
print(df.head())

value_counts = df['airline_sentiment'].value_counts()
print(value_counts)
df = df.dropna(subset=['airline_sentiment','text'])
df = df.astype({'airline_sentiment':'str','text':'str'})
formatted_and_cleaned_lines = df.apply(lambda row: clean_text_spacy(row[1]) + " @" + row[0].lower(), axis=1)

with open('./data/train/clean_train_texts/airline-sentiment-clean-spacy.txt','w', encoding='utf-8') as file:
    for line in formatted_and_cleaned_lines:
        if len(line) > 10:  # Only write lines that are longer than the minimum length
            file.write(line + '\n')

  airline_sentiment                                               text
0           neutral                @VirginAmerica What @dhepburn said.
1          positive  @VirginAmerica plus you've added commercials t...
2           neutral  @VirginAmerica I didn't today... Must mean I n...
3          negative  @VirginAmerica it's really aggressive to blast...
4          negative  @VirginAmerica and it's a really big bad thing...
  airline_sentiment                                               text
0           neutral                                What dhepburn said.
1          positive   plus you've added commercials to the experien...
2           neutral   I didn't today... Must mean I need to take an...
3          negative   it's really aggressive to blast obnoxious "en...
4          negative           and it's a really big bad thing about it
airline_sentiment
negative    9178
neutral     3099
positive    2363
Name: count, dtype: int64


  formatted_and_cleaned_lines = df.apply(lambda row: clean_text_spacy(row[1]) + " @" + row[0].lower(), axis=1)


In [10]:
nlp = spacy.load("en_core_web_sm")

def clean_line(line):
    doc = nlp(line)
    filtered_tokens = [token.text for token in doc if token.is_alpha and not token.text.startswith('@')]
    # Join the tokens back into a string
    return ' '.join(filtered_tokens)

input_file_path = './data/train/twitter_training.csv'
output_file_path = './data/train/clean_train_texts/twitter_training_clean_spacy.txt'

with open(input_file_path, 'r', encoding='utf-8') as input_file, \
    open(output_file_path, 'a', encoding='utf-8') as output_file:
    for line in input_file:
        cleaned_line = clean_line(line)
        output_file.write(clean_line +)

One of our own is live Catch him here Say Streamer Shouts in chat for a chance to be in a coming Shout Out
