# Lexicon-based approaches

## Pattern.nl

In [None]:
# Install library
!pip install pattern

Collecting pattern
  Downloading Pattern-3.6.0.tar.gz (22.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m22.2/22.2 MB[0m [31m16.9 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting backports.csv (from pattern)
  Downloading backports.csv-1.0.7-py2.py3-none-any.whl (12 kB)
Collecting mysqlclient (from pattern)
  Downloading mysqlclient-2.2.4.tar.gz (90 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m90.4/90.4 kB[0m [31m11.1 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Installing backend dependencies ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting feedparser (from pattern)
  Downloading feedparser-6.0.11-py3-none-any.whl (81 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m81.3/81.3 kB[0m [31m11.8 MB/s[0m eta [36m0:00:00[0m


In [None]:
import pandas as pd
from pattern.nl import sentiment
from sklearn.metrics import classification_report, confusion_matrix

# Translate a numerical label into its sentiment category name
def map_labels(label):
    """Return the sentiment category for a numeric label.

    0 -> "negative", 1 -> "neutral", 2 -> "positive"; every other value
    yields "unknown".
    """
    # Labels are compared with == (not hashed), so any value equal to
    # 0, 1 or 2 matches and unhashable values simply fall through.
    for code, category in ((0, "negative"), (1, "neutral"), (2, "positive")):
        if label == code:
            return category
    return "unknown"

# Predict a sentiment label for every text and evaluate the predictions
def analyze_sentiment_and_report(dataset):
    """Run ``sentiment`` over a dataset and evaluate against its labels.

    Args:
        dataset: DataFrame with a "text" column (each value is passed to
            ``sentiment``) and a "labels" column (numeric labels that are
            converted with ``map_labels``).

    Returns:
        Tuple ``(report, conf_matrix)``: the output of
        ``classification_report`` and the output of ``confusion_matrix``
        with its labels ordered negative, neutral, positive.
    """
    def classify(text):
        polarity, _ = sentiment(text)
        # Threshold for neutral is 0: only strictly positive / strictly
        # negative polarity gets a polar label.
        if polarity > 0:
            return "positive"
        if polarity < 0:
            return "negative"
        return "neutral"

    predicted_labels = [classify(text) for text in dataset["text"]]

    # Ground truth expressed with the same category names as the predictions
    ground_truth_labels = dataset["labels"].apply(map_labels)

    report = classification_report(ground_truth_labels, predicted_labels)

    category_order = ["negative", "neutral", "positive"]
    conf_matrix = confusion_matrix(
        ground_truth_labels, predicted_labels, labels=category_order
    )

    return report, conf_matrix

# Dataset files to evaluate
dataset_paths = [
    "1960s_gas.csv",
    "1970s_gas.csv",
    "1980s_gas.csv",
    "1990s_gas.csv",
]

# Evaluate every dataset in turn and print its metrics
for dataset_path in dataset_paths:
    # The text before the first "." serves as the display name
    dataset_name = dataset_path.partition(".")[0]
    dataset = pd.read_csv(dataset_path)
    # The header is printed before the analysis runs
    print(f"Classification Report for {dataset_name}:")
    report, conf_matrix = analyze_sentiment_and_report(dataset)
    print(report, "Confusion Matrix:", conf_matrix, sep="\n")
    print("-" * 50)  # Separating reports


Classification Report for 1960s_gas:
              precision    recall  f1-score   support

    negative       0.21      0.26      0.23        82
     neutral       0.50      0.03      0.06       131
    positive       0.54      0.79      0.64       220

    accuracy                           0.46       433
   macro avg       0.42      0.36      0.31       433
weighted avg       0.46      0.46      0.39       433

Confusion Matrix:
[[ 21   0  61]
 [ 37   4  90]
 [ 42   4 174]]
--------------------------------------------------
Classification Report for 1970s_gas:
              precision    recall  f1-score   support

    negative       0.27      0.42      0.33        19
     neutral       0.00      0.00      0.00        22
    positive       0.58      0.69      0.63        55

    accuracy                           0.48        96
   macro avg       0.28      0.37      0.32        96
weighted avg       0.39      0.48      0.43        96

Confusion Matrix:
[[ 8  0 11]
 [ 6  0 16]
 [16  1

## LUPJE

In [None]:
import pandas as pd
from sklearn.metrics import classification_report

# Read the lexicon file into a DataFrame, skipping lines with incorrect formatting.
# Rows are collected in a plain list and turned into a DataFrame once: growing a
# DataFrame with pd.concat inside the loop copies all previous rows on every
# iteration and leaves every row with the same index label (0).
lexicon_rows = []

# NOTE(review): the file is opened with the platform default encoding — confirm
# the encoding of LUPJE.txt and pass encoding=... explicitly if it is known.
with open("LUPJE.txt", "r") as file:
    for line in file:
        try:
            # Exactly two tab-separated fields are expected; any other count
            # makes the unpacking raise ValueError.
            word, sentiment_score = line.strip().split("\t")
        except ValueError:
            print(f"Skipping line with incorrect formatting: {line.strip()}")
            continue
        lexicon_rows.append({"word": word, "sentiment_score": sentiment_score})

# Passing the column names keeps both columns present even if no line was valid
words_sentiment_df = pd.DataFrame(lexicon_rows, columns=["word", "sentiment_score"])

# Convert sentiment scores to numeric type
words_sentiment_df["sentiment_score"] = pd.to_numeric(words_sentiment_df["sentiment_score"])

# Function that performs the sentiment analysis
def analyze_sentiment(text):
    """Classify a text as "positive", "negative" or "neutral" using the lexicon.

    The text is split on whitespace and the ``sentiment_score`` of every token
    found in the module-level ``words_sentiment_df`` is summed; a token that
    occurs several times in the text is counted each time.

    Args:
        text: String to classify.

    Returns:
        "positive" if the summed score is greater than 0, "negative" if it is
        less than 0, otherwise "neutral" (including when no token is found in
        the lexicon).
    """
    # Tokenize the text
    tokens = text.split()

    # Select the lexicon rows for this text's tokens in a single vectorized
    # pass instead of scanning the whole word column twice for every token.
    matches = words_sentiment_df.loc[
        words_sentiment_df["word"].isin(tokens), ["word", "sentiment_score"]
    ]

    # Word -> score lookup; setdefault keeps the first lexicon entry when a
    # word is listed more than once.
    scores = {}
    for word, score in zip(matches["word"], matches["sentiment_score"]):
        scores.setdefault(word, score)

    # Calculate sentiment score based on words in the text
    sentiment_score = sum(scores[token] for token in tokens if token in scores)

    # Determine sentiment label based on sentiment score
    if sentiment_score > 0:
        return "positive"
    elif sentiment_score < 0:
        return "negative"
    else:
        return "neutral"

# Perform sentiment analysis on the dataset
def perform_sentiment_analysis(dataset):
    """Add true and predicted sentiment columns to ``dataset``.

    Args:
        dataset: DataFrame with a numeric "labels" column and a "text" column.
            It is modified in place: the columns "true_sentiment" and
            "predicted_sentiment" are added (or overwritten).

    Returns:
        The same ``dataset`` object, with both columns set.
    """
    label_names = {0: "negative", 1: "neutral", 2: "positive"}

    # Map numerical labels to sentiment categories. Labels other than 0/1/2
    # become "unknown" instead of NaN, so the column stays all-string and
    # agrees with map_labels used for the Pattern.nl evaluation.
    dataset["true_sentiment"] = dataset["labels"].map(
        lambda label: label_names.get(label, "unknown")
    )

    # Apply sentiment analysis to the text column
    dataset["predicted_sentiment"] = dataset["text"].apply(analyze_sentiment)
    return dataset

# Run the lexicon-based analysis on a dataset and summarise it as a classification report
def generate_classification_report(dataset):
    """Return the classification report for one dataset.

    ``dataset`` is passed through ``perform_sentiment_analysis``; the
    resulting "true_sentiment" and "predicted_sentiment" columns are then
    compared with ``classification_report``, whose result is returned.
    """
    analyzed = perform_sentiment_analysis(dataset)
    true_labels = analyzed["true_sentiment"]
    predicted_labels = analyzed["predicted_sentiment"]
    return classification_report(true_labels, predicted_labels)

# Dataset files to evaluate
dataset_paths = [
    "1960s_gas.csv",
    "1970s_gas.csv",
    "1980s_gas.csv",
    "1990s_gas.csv",
]

# Print one classification report per dataset
for dataset_path in dataset_paths:
    # The text before the first "." serves as the display name
    dataset_name = dataset_path.partition(".")[0]
    your_dataset = pd.read_csv(dataset_path)
    # The header is printed before the analysis runs
    print(f"Classification Report for {dataset_name}:")
    report = generate_classification_report(your_dataset)
    print(report, "-" * 50, sep="\n")


Classification Report for 1960s_gas:
              precision    recall  f1-score   support

    negative       0.20      0.30      0.25        82
     neutral       0.43      0.07      0.12       131
    positive       0.52      0.69      0.59       220

    accuracy                           0.43       433
   macro avg       0.38      0.35      0.32       433
weighted avg       0.43      0.43      0.38       433

--------------------------------------------------
Classification Report for 1970s_gas:
              precision    recall  f1-score   support

    negative       0.11      0.21      0.15        19
     neutral       0.17      0.05      0.07        22
    positive       0.50      0.49      0.50        55

    accuracy                           0.33        96
   macro avg       0.26      0.25      0.24        96
weighted avg       0.35      0.33      0.33        96

--------------------------------------------------
Classification Report for 1980s_gas:
              precision  