# Data Cleansing

In [1]:
import pandas as pd
import numpy as np
import re
import sklearn
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report


In [2]:
import csv

# Write a dataframe (header row first, then every data row) to a csv file
def saveToCsv(fileName, df):
    """Dump ``df`` to ``../data/<fileName>.csv``.

    The column names are written as the first row, followed by one row per
    record of ``df.values``. An existing file with the same name is
    overwritten (the file is opened in ``'w'`` mode).

    Parameters
    ----------
    fileName : str
        Name of the target file, without directory and without extension.
    df : pandas.DataFrame
        Frame whose columns and values are written.
    """
    target_path = f'../data/{fileName}.csv'
    # newline='' lets the csv module control line endings itself.
    with open(target_path, mode='w', encoding='UTF8', newline='') as csv_file:
        writer = csv.writer(csv_file)
        writer.writerow(df.columns)
        for record in df.values:
            writer.writerow(record)

In [3]:
# Load the solar and ebikes datasets from their csv files
solar, ebikes = pd.read_csv('../data/solar.csv'), pd.read_csv('../data/ebikes.csv')

In [4]:
# Remove new line characters (line feeds and carriage returns) from the texts.
# regex=True is passed explicitly: without it, newer pandas versions treat the
# pattern as a literal string (a backslash followed by a letter) and would
# leave the actual newline characters in place.
for frame in (ebikes, solar):
    frame['content'] = frame['content'].str.replace(r"[\r\n]", "", regex=True)

  ebikes['content'] = ebikes['content'].str.replace(r"\n", "")
  ebikes['content'] = ebikes['content'].str.replace(r"\r", "")
  solar['content'] = solar['content'].str.replace(r"\n", "")
  solar['content'] = solar['content'].str.replace(r"\r", "")


In [5]:
# Tag every row with the name of the dataset it belongs to
solar['label'], ebikes['label'] = 'solar', 'ebikes'

In [6]:
# Print length of solar and length of ebikes datasets
print(f'len(solar) = {len(solar)}, len(ebikes) = {len(ebikes)}')

# Balance the two datasets: keep only the first n rows of each one, where n is
# the size of the smaller dataset (computed from the data instead of being
# hard-coded, so the cell stays correct if the csv files change).
balanced_size = min(len(solar), len(ebikes))
solar = solar.head(balanced_size)
ebikes = ebikes.head(balanced_size)
print(f'len(solar) = {len(solar)}, len(ebikes) = {len(ebikes)}')

len(solar) = 1204, len(ebikes) = 1032
len(solar) = 1032, len(ebikes) = 1032


# Classification Models

### Naive Bayes with Count Vectorizer

In [7]:
# Hold out 20% of each dataset for testing and keep 80% for training;
# the fixed random_state makes the split reproducible
split_options = {'test_size': 0.2, 'random_state': 42}
solar_train, solar_test = train_test_split(solar, **split_options)
ebikes_train, ebikes_test = train_test_split(ebikes, **split_options)

In [8]:
# Stack the solar and ebikes parts into one train set and one test set,
# renumbering the rows from 0
train = pd.concat([solar_train, ebikes_train], ignore_index=True)
test = pd.concat([solar_test, ebikes_test], ignore_index=True)

In [9]:
# Extract the content and label columns from the dataframes
train_data = train["content"].values
train_labels = train["label"].values
test_data = test["content"].values
test_labels = test["label"].values

# Conversion of texts into count features.
# The vectorizer is fitted on the training texts only; the test texts are
# transformed with the already fitted vectorizer.
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(train_data)
test_features = count_vect.transform(test_data)

# Training
classifier = MultinomialNB()
classifier.fit(X_train_counts, train_labels)

# Test
predictions = classifier.predict(test_features)

# Take the class names from the fitted model and pass them as both `labels`
# and `target_names`, so each row of the report is guaranteed to be paired
# with the right class instead of relying on a hard-coded ordering.
target_names = list(classifier.classes_)
print(classification_report(test_labels, predictions, labels=target_names, target_names=target_names))

              precision    recall  f1-score   support

      ebikes       1.00      0.99      1.00       207
       solar       0.99      1.00      1.00       207

    accuracy                           1.00       414
   macro avg       1.00      1.00      1.00       414
weighted avg       1.00      1.00      1.00       414



# Baseline

Classification based on the number of occurrences of the name of the category itself or of its minor variations, i.e. plural forms or different ways of spelling it (e.g. 'bike' / 'ebike' / 'e-bike' / 'bicycle' / 'e-bicycle').

In [19]:
import nltk
import string

# Creates and returns the vocabulary of a given document
def create_vocabulary(document, remove_stop_words=False, remove_punctuation=False, remove_numbers=False, remove_duplicates=False, docLanguage='english'):
    """Tokenize ``document`` and return its lower-cased tokens as a list.

    Parameters
    ----------
    document : str
        Text to tokenize with ``nltk.word_tokenize``.
    remove_stop_words : bool
        Drop tokens found in ``nltk.corpus.stopwords.words(docLanguage)``.
    remove_punctuation : bool
        Drop tokens that are exactly one character of ``string.punctuation``.
    remove_numbers : bool
        Drop tokens for which ``str.isdigit()`` is true.
    remove_duplicates : bool
        Keep only the first occurrence of every token. The order of first
        appearance is preserved, so the result is deterministic.
    docLanguage : str
        Language name forwarded to the tokenizer and to the stop-word list.

    Returns
    -------
    list of str
        The remaining lower-cased tokens, in document order.

    NOTE(review): presumably the NLTK tokenizer and stop-word resources must
    already be downloaded - confirm for the target environment.
    """
    tokens = nltk.word_tokenize(document, language=docLanguage)
    stop_words = set(nltk.corpus.stopwords.words(docLanguage)) if remove_stop_words else set()
    punctuation = set(string.punctuation) if remove_punctuation else set()

    vocabulary = []
    for token in tokens:
        word = token.lower()  # lower-case once, reuse for every check
        if word in stop_words or word in punctuation:
            continue
        if remove_numbers and word.isdigit():
            continue
        vocabulary.append(word)

    if remove_duplicates:
        # dict.fromkeys keeps first-seen order, unlike set() whose iteration
        # order for strings can change between interpreter runs.
        return list(dict.fromkeys(vocabulary))
    return vocabulary

In [12]:
# Classify a text based on its vocabulary into one of the given classes
def classify_text(text, classes):
    """Return the class whose keywords occur most often in ``text``.

    ``classes`` maps each class name to a collection of keywords. The text is
    turned into a vocabulary with stop words, punctuation and numbers removed
    (duplicates are kept, so repeated keywords count repeatedly). When several
    classes share the highest count, the one that comes first in ``classes``
    is returned.
    """
    tokens = create_vocabulary(text, True, True, True)

    # One keyword-hit total per class, in the iteration order of `classes`.
    hits_per_class = {
        class_name: sum(1 for token in tokens if token in classes[class_name])
        for class_name in classes
    }

    return max(hits_per_class, key=hits_per_class.get)

In [13]:
# Classify the given texts into the given classes and evaluate the results with the main classification metrics
def classify_texts_and_evaluate(texts, classes):
    """Run the keyword baseline on ``texts`` and print a classification report.

    Parameters
    ----------
    texts : pandas.DataFrame
        Must provide a ``'content'`` column (the text) and a ``'label'`` column
        (the true class). A ``'baseline'`` column holding the predicted class
        is added to this frame in place.
    classes : dict
        Maps each class name to its keywords; forwarded to ``classify_text``.

    Returns
    -------
    None
        The report is printed, not returned.
    """
    texts['baseline'] = texts['content'].apply(lambda content: classify_text(content, classes))

    # Passing `labels` together with `target_names` ties every report row to
    # its class explicitly, so the report stays valid even when one of the
    # classes does not appear among the true or predicted labels.
    class_names = sorted(classes)
    print(classification_report(texts['label'].values, texts['baseline'].values,
                                labels=class_names, target_names=class_names))

In [14]:
# Hold out 20% of each dataset for testing and keep 80% for training;
# the fixed random_state makes the split reproducible
split_options = {'test_size': 0.2, 'random_state': 42}
solar_train, solar_test = train_test_split(solar, **split_options)
ebikes_train, ebikes_test = train_test_split(ebikes, **split_options)

In [15]:
# Stack the solar and ebikes parts into one train set and one test set,
# renumbering the rows from 0
train = pd.concat([solar_train, ebikes_train], ignore_index=True)
test = pd.concat([solar_test, ebikes_test], ignore_index=True)


# Keywords counted by the baseline for each class
classes = {
    'solar': ['panels', 'panel'],
    'ebikes': [
        'ebike', 'ebikes',
        'bike', 'bikes',
        'e-bike', 'e-bikes',
        'bicycle', 'bicycles',
        'e-bicycle', 'e-bicycles',
    ],
}

In [16]:
# Sanity check: run the baseline on a handful of hand-written texts
dummy_contents = [
    'I like solar panels',
    'A friend of mine just bought a really expensive e-bike',
    'I like e-bicycles but I like solar panels too, because with a solar panel I can recharge all my electric gadgets.',
    'I want to be self-sufficient and respect the environment.',
    'My two-wheels is so cool!',
]
dummy_labels = ['solar', 'ebikes', 'ebikes', 'solar', 'ebikes']
dummy_texts = pd.DataFrame({'content': dummy_contents, 'label': dummy_labels})
classify_texts_and_evaluate(dummy_texts, classes)

              precision    recall  f1-score   support

      ebikes       1.00      0.33      0.50         3
       solar       0.50      1.00      0.67         2

    accuracy                           0.60         5
   macro avg       0.75      0.67      0.58         5
weighted avg       0.80      0.60      0.57         5



In [17]:
# Now let's try with the real test set.
# Besides printing the report, this call stores the baseline prediction of
# every row in a new 'baseline' column of `test`.
classify_texts_and_evaluate(test, classes)

              precision    recall  f1-score   support

      ebikes       1.00      1.00      1.00       207
       solar       1.00      1.00      1.00       207

    accuracy                           1.00       414
   macro avg       1.00      1.00      1.00       414
weighted avg       1.00      1.00      1.00       414



In [18]:
# Show the texts whose baseline prediction differs from the true label
misclassified_mask = test['baseline'].ne(test['label'])
test[misclassified_mask]

Unnamed: 0,date,content,label,baseline
86,2020/03/31,In today’s Electrek Green Energy Brief (EGEB):...,solar,ebikes
