# Data Cleansing

In [1]:
import pandas as pd
import numpy as np
import re
import sklearn
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report

In [50]:
import csv

# Function to write dataframe to csv file
def saveToCsv(fileName, df):
    """Dump a dataframe to ``../data/<fileName>.csv``.

    The column labels are written as the first row, followed by one row per
    record taken from ``df.values``. The file is opened in 'w' mode, so an
    existing file with the same name is overwritten.
    """
    destination = f'../data/{fileName}.csv'
    # newline='' leaves line-ending handling to the csv module.
    with open(destination, 'w', encoding='UTF8', newline='') as handle:
        writer = csv.writer(handle)
        writer.writerow(df.columns)
        writer.writerows(df.values)

In [2]:
# Read in data from csv files (one dataframe per category)
solar, ebikes, tesla = (
    pd.read_csv(csv_path)
    for csv_path in ('../data/solar.csv', '../data/ebikes.csv', '../data/tesla.csv')
)

In [3]:
# Remove new line characters (both "\n" and "\r") from the article texts.
# regex=True is passed explicitly: the default of Series.str.replace changed
# from True to False in pandas 2.0, and with regex=False the raw pattern r"\n"
# is a literal backslash followed by "n", so no newline would be removed.
for frame in (ebikes, solar, tesla):
    frame['content'] = frame['content'].str.replace(r"[\r\n]", "", regex=True)

  ebikes['content'] = ebikes['content'].str.replace(r"\n", "")
  ebikes['content'] = ebikes['content'].str.replace(r"\r", "")
  solar['content'] = solar['content'].str.replace(r"\n", "")
  solar['content'] = solar['content'].str.replace(r"\r", "")
  tesla['content'] = tesla['content'].str.replace(r"\n", "")
  tesla['content'] = tesla['content'].str.replace(r"\r", "")


In [4]:
# Add labels to data: every row of a dataset gets that dataset's category name
for frame, category in ((solar, 'solar'), (ebikes, 'ebikes'), (tesla, 'tesla')):
    frame['label'] = category

In [5]:
# Print length of the datasets
print(f'len(solar) = {len(solar)}, len(ebikes) = {len(ebikes)}, len(tesla) = {len(tesla)}')

# Balance the classes: keep the first n_rows rows of every dataset, where n_rows
# is the size of the smallest one (1032 rows, the ebikes dataset, for the current
# files). Deriving the value from the data instead of hard-coding it keeps the
# three datasets the same size if the csv files change.
n_rows = min(len(solar), len(ebikes), len(tesla))
solar = solar.head(n_rows)
ebikes = ebikes.head(n_rows)
tesla = tesla.head(n_rows)
print(f'len(solar) = {len(solar)}, len(ebikes) = {len(ebikes)}, len(tesla) = {len(tesla)}')

len(solar) = 1204, len(ebikes) = 1032, len(tesla) = 1202
len(solar) = 1032, len(ebikes) = 1032, len(tesla) = 1032


# Classification Models

### Naive Bayes with Count Vectorizer

In [6]:
# Split the data into train and test sets (80% for training and 20% for testing).
# Each category is split on its own, always with the same seed.
(solar_train, solar_test), (ebikes_train, ebikes_test), (tesla_train, tesla_test) = (
    train_test_split(frame, test_size=0.2, random_state=42)
    for frame in (solar, ebikes, tesla)
)

In [7]:
# Join the sets for train and test (row order: solar, ebikes, tesla; index renumbered)
train_parts = [solar_train, ebikes_train, tesla_train]
test_parts = [solar_test, ebikes_test, tesla_test]
train = pd.concat(train_parts).reset_index(drop=True)
test = pd.concat(test_parts).reset_index(drop=True)

In [8]:
# Extract the content and label columns from the dataframes
train_data = train["content"].values
train_labels = train["label"].values
test_data = test["content"].values
test_labels = test["label"].values

# Conversion of texts into features: the vocabulary is fitted on the training
# texts only and then reused to transform the test texts.
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(train_data)
test_features = count_vect.transform(test_data)

# Training
classifier = MultinomialNB()
classifier.fit(X_train_counts, train_labels)

# Test
predictions = classifier.predict(test_features)

# Take the class names from the fitted classifier instead of hard-coding them,
# and pass them as `labels` as well, so every row of the report is guaranteed
# to be named after the class it actually describes.
target_names = list(classifier.classes_)
print(classification_report(test_labels, predictions, labels=target_names, target_names=target_names))

              precision    recall  f1-score   support

      ebikes       1.00      0.99      1.00       207
       solar       0.97      1.00      0.98       207
       tesla       1.00      0.98      0.99       207

    accuracy                           0.99       621
   macro avg       0.99      0.99      0.99       621
weighted avg       0.99      0.99      0.99       621



# Baseline

Classification based on the number of occurrences of the category name itself or of its minor variations, i.e. plural forms or alternative spellings (e.g. 'bike' / 'ebike' / 'e-bike' / 'bicycle' / 'e-bicycle').

In [9]:
import nltk
import string

# Creates and returns the vocabulary of a given document
def create_vocabulary(document, remove_stop_words=False, remove_punctuation=False, remove_numbers=False, remove_duplicates=False, docLanguage='english'):
    """Tokenize `document` and return its lower-cased tokens.

    Parameters
    ----------
    document : str
        Text to tokenize with ``nltk.word_tokenize``.
    remove_stop_words : bool
        Drop tokens found in the NLTK stop-word list for `docLanguage`.
    remove_punctuation : bool
        Drop tokens that are a single ``string.punctuation`` character.
    remove_numbers : bool
        Drop tokens for which ``str.isdigit()`` is true.
    remove_duplicates : bool
        Return each distinct token once; the result then goes through a
        ``set``, so its order is not the order of the document.
    docLanguage : str
        Language name passed to the tokenizer and to the stop-word list.

    Returns
    -------
    list of str
    """
    lowered = [token.lower() for token in nltk.word_tokenize(document, language=docLanguage)]

    excluded_words = set(nltk.corpus.stopwords.words(docLanguage)) if remove_stop_words else set()
    excluded_marks = set(string.punctuation) if remove_punctuation else set()

    kept = []
    for word in lowered:
        if word in excluded_words:
            continue
        if word in excluded_marks:
            continue
        if remove_numbers and word.isdigit():
            continue
        kept.append(word)

    return list(set(kept)) if remove_duplicates else kept

In [10]:
# Classify a text based on its vocabulary into one of the given classes
def classify_text(text, classes):
    """Return the class whose keywords occur most often in `text`.

    `classes` maps a class name to its keywords. The text is tokenized with
    create_vocabulary (stop words, punctuation and numbers removed) and every
    token that is one of a class's keywords counts as one hit for that class.
    On a tie - including the case of no hit at all - the class that comes
    first in `classes` is returned.
    """
    tokens = create_vocabulary(text, True, True, True)

    hits = {
        name: sum(1 for token in tokens if token in keywords)
        for name, keywords in classes.items()
    }

    return max(hits, key=hits.get)

In [11]:
# Classify the given texts into the given classes and evaluate the results with the main classification metrics
def classify_texts_and_evaluate(texts, classes):
    """Run the keyword baseline on `texts` and print its classification report.

    Parameters
    ----------
    texts : pandas.DataFrame
        Needs a 'content' column (the texts) and a 'label' column (the true
        classes). It is modified in place: the predictions are stored in a
        new 'baseline' column, which callers can inspect afterwards.
    classes : dict
        Maps each class name to its list of keywords (see classify_text).

    Returns
    -------
    None
        The report is printed, not returned.
    """
    texts['baseline'] = texts['content'].apply(lambda x: classify_text(x, classes))

    # Pass the class names as `labels` too: with only `target_names`, the report
    # infers the labels from the data and fails when a class appears neither in
    # the true labels nor in the predictions.
    class_names = sorted(classes.keys())
    print(classification_report(texts['label'].values, texts['baseline'].values,
                                labels=class_names, target_names=class_names))

In [12]:
# Split the data into train and test sets (80% for training and 20% for testing)
split_options = dict(test_size=0.2, random_state=42)
solar_train, solar_test = train_test_split(solar, **split_options)
ebikes_train, ebikes_test = train_test_split(ebikes, **split_options)
tesla_train, tesla_test = train_test_split(tesla, **split_options)

In [13]:
# Join the sets for train and test
train, test = (
    pd.concat(parts).reset_index(drop=True)
    for parts in (
        [solar_train, ebikes_train, tesla_train],
        [solar_test, ebikes_test, tesla_test],
    )
)

# Define the classes and their keywords.
# Key order matters: classify_text returns the first class of this dict on a tie.
classes = {
    'solar': ['panels', 'panel'],
    'ebikes': [
        'ebike', 'ebikes', 'bike', 'bikes', 'e-bike', 'e-bikes',
        'bicycle', 'bicycles', 'e-bicycle', 'e-bicycles',
    ],
    'tesla': ['tesla', 'model', 'elon', 'musk'],
}

In [14]:
# Trying to classify some dummy texts with the baseline approach
dummy_examples = [
    ('I like solar panels', 'solar'),
    ('A friend of mine just bought a really expensive e-bike', 'ebikes'),
    ('I like e-bicycles but I like solar panels too, because with a solar panel I can recharge all my electric gadgets.', 'ebikes'),
    ('I want to be self-sufficient and respect the environment.', 'solar'),
    ('My two-wheels is so cool!', 'ebikes'),
    ('Elon Musk is the best', 'tesla'),
    ('I want to buy a Tesla', 'tesla'),
    ('I tried the Model 3 and it was amazing', 'tesla'),
]
dummy_texts = pd.DataFrame(dummy_examples, columns=['content', 'label'])
classify_texts_and_evaluate(dummy_texts, classes)

              precision    recall  f1-score   support

      ebikes       1.00      0.33      0.50         3
       solar       0.50      1.00      0.67         2
       tesla       1.00      1.00      1.00         3

    accuracy                           0.75         8
   macro avg       0.83      0.78      0.72         8
weighted avg       0.88      0.75      0.73         8



In [16]:
# Now let's try with the real test set (this also adds the 'baseline' column to `test`)
classify_texts_and_evaluate(texts=test, classes=classes)

              precision    recall  f1-score   support

      ebikes       1.00      1.00      1.00       207
       solar       0.93      0.56      0.70       207
       tesla       0.69      0.96      0.80       207

    accuracy                           0.84       621
   macro avg       0.87      0.84      0.83       621
weighted avg       0.87      0.84      0.83       621



In [17]:
# Show the texts that were misclassified (baseline prediction differs from the true label)
# NOTE(review): the displayed rows contain what look like repeated articles (same
# date and same opening text); the datasets may hold duplicates - confirm, and
# consider de-duplicating before splitting into train and test.
is_misclassified = test['baseline'].ne(test['label'])
test.loc[is_misclassified]

Unnamed: 0,date,content,label,baseline
0,2022/11/21,Quaise Energy is on a mission to prove that de...,solar,tesla
1,2022/12/29,"The world’s first floating offshore wind farm,...",solar,tesla
4,2023/03/16,Tesla is launching a new feature to help homeo...,solar,tesla
5,2023/03/16,Tesla is launching a new feature to help homeo...,solar,tesla
6,2022/01/19,The US Department of Interior’s Bureau of Ocea...,solar,tesla
...,...,...,...,...
490,2023/03/23,Lucid Motors is in the process of delivering a...,tesla,solar
499,2023/03/23,Lucid Motors is in the process of delivering a...,tesla,solar
546,2023/03/23,Lucid Motors is in the process of delivering a...,tesla,solar
582,2023/03/23,Lucid Motors is in the process of delivering a...,tesla,solar
