# Lexicon-based approaches

## Pattern.nl

In [7]:
# Install library
!pip install pattern
!pip install nltk
!pip install scikit-learn



In [5]:
import pandas as pd
from pattern.nl import sentiment
from sklearn.metrics import classification_report, confusion_matrix
import re
import string
import nltk

# Download NLTK data files.
# 'punkt' serves older NLTK releases; newer releases load the tokenizer
# models used by word_tokenize from 'punkt_tab' instead, so both are fetched
# to keep a fresh, unpinned install working.
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# List of Dutch stop words (a set, so membership tests in preprocess_text are fast)
dutch_stopwords = set(stopwords.words('dutch'))

# Normalise a raw text before it is handed to the sentiment scorer
def preprocess_text(text):
    """Lowercase ``text``, strip ASCII punctuation and drop Dutch stop words.

    Args:
        text: The raw string to clean.

    Returns:
        The remaining tokens joined by single spaces.
    """
    # NOTE(review): stop-word removal presumably also removes negations or
    # intensifiers that a sentiment lexicon may rely on (e.g. if "niet" is in
    # the NLTK Dutch list) — confirm this is intended.
    punctuation_table = str.maketrans('', '', string.punctuation)
    cleaned = text.lower().translate(punctuation_table)
    kept_tokens = (tok for tok in word_tokenize(cleaned) if tok not in dutch_stopwords)
    return ' '.join(kept_tokens)

# Translate a numeric class id into its sentiment name
def map_labels(label):
    """Return the sentiment category for a numeric label.

    0 -> "negative", 1 -> "neutral", 2 -> "positive"; any other value
    yields "unknown".
    """
    # Checked in order with ==, so the comparison semantics match a plain
    # if/elif chain.
    for code, category in ((0, "negative"), (1, "neutral"), (2, "positive")):
        if label == code:
            return category
    return "unknown"

# Create function to perform sentiment analysis and generate classification report and confusion matrix
def analyze_sentiment_and_report(dataset, neutral_threshold=0.0):
    """Score every text with pattern.nl and evaluate against the true labels.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Needs a "text" column of strings and a "labels" column holding
        0 (negative), 1 (neutral) or 2 (positive). A "preprocessed_text"
        column is added to this frame in place.
    neutral_threshold : float, default 0.0
        Half-width of the neutral band around zero. A polarity above
        ``neutral_threshold`` is "positive", one below ``-neutral_threshold``
        is "negative", everything in between is "neutral". The default of 0
        labels only an exactly-zero polarity as neutral.

    Returns
    -------
    tuple
        ``(report, conf_matrix)``: the text produced by
        ``classification_report`` and the matrix produced by
        ``confusion_matrix`` with rows/columns ordered
        negative, neutral, positive.

    Raises
    ------
    ValueError
        If ``neutral_threshold`` is negative.
    """
    if neutral_threshold < 0:
        raise ValueError("neutral_threshold must be non-negative")

    # Preprocess text in the dataset (kept as a column so callers can inspect it)
    dataset["preprocessed_text"] = dataset["text"].apply(preprocess_text)

    def polarity_to_label(polarity):
        # Symmetric band around zero; with the default threshold this is
        # exactly "> 0 positive, < 0 negative, otherwise neutral".
        if polarity > neutral_threshold:
            return "positive"
        if polarity < -neutral_threshold:
            return "negative"
        return "neutral"

    predicted_labels = []
    for text in dataset["preprocessed_text"]:
        # sentiment() yields a pair; only its first element (polarity) is used
        polarity, _ = sentiment(text)
        predicted_labels.append(polarity_to_label(polarity))

    # Map numerical ground truth labels to sentiment categories
    ground_truth_labels = dataset["labels"].apply(map_labels)

    # Create classification report
    report = classification_report(ground_truth_labels, predicted_labels)

    # Create confusion matrix with a fixed class order
    label_order = ["negative", "neutral", "positive"]
    conf_matrix = confusion_matrix(ground_truth_labels, predicted_labels, labels=label_order)

    return report, conf_matrix

# CSV files to evaluate, one per decade
dataset_paths = [
    "1960s_gas.csv",
    "1970s_gas.csv",
    "1980s_gas.csv",
    "1990s_gas.csv",
]

# Evaluate each file in turn and print its metrics
for dataset_path in dataset_paths:
    # The report title is the file name up to its first dot
    dataset_name = dataset_path.partition(".")[0]
    dataset = pd.read_csv(dataset_path)
    print(f"Classification Report for {dataset_name}:")
    report, conf_matrix = analyze_sentiment_and_report(dataset)
    print(report, "Confusion Matrix:", conf_matrix, sep="\n")
    print("-" * 50)  # Separating reports


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Classification Report for 1960s_gas:
              precision    recall  f1-score   support

    negative       0.24      0.18      0.21        82
     neutral       0.55      0.05      0.08       131
    positive       0.53      0.86      0.66       220

    accuracy                           0.49       433
   macro avg       0.44      0.36      0.32       433
weighted avg       0.48      0.49      0.40       433

Confusion Matrix:
[[ 15   0  67]
 [ 22   6 103]
 [ 25   5 190]]
--------------------------------------------------
Classification Report for 1970s_gas:
              precision    recall  f1-score   support

    negative       0.33      0.21      0.26        19
     neutral       0.00      0.00      0.00        22
    positive       0.58      0.87      0.70        55

    accuracy                           0.54        96
   macro avg       0.30      0.36      0.32        96
weighted avg       0.40      0.54      0.45        96

Confusion Matrix:
[[ 4  0 15]
 [ 2  0 20]
 [ 6  1

## LUPJE

In [6]:
!pip install nltk
!pip install scikit-learn



In [2]:
import pandas as pd
import re
import string
import nltk
from sklearn.metrics import classification_report

# Download NLTK data files.
# 'punkt' serves older NLTK releases; newer releases load the tokenizer
# models used by word_tokenize from 'punkt_tab' instead, so both are fetched
# to keep a fresh, unpinned install working.
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# Translate a numeric class id into its sentiment name
def map_labels(label):
    """Return "negative" for 0, "neutral" for 1, "positive" for 2, else "unknown"."""
    return (
        "negative" if label == 0
        else "neutral" if label == 1
        else "positive" if label == 2
        else "unknown"
    )

# Dutch stop words from NLTK, held in a set for fast membership tests
dutch_stopwords = {*stopwords.words('dutch')}

# Normalise a raw text before it is matched against the lexicon
def preprocess_text(text):
    """Lowercase ``text``, strip ASCII punctuation and digits, drop Dutch stop words.

    Args:
        text: The raw string to clean.

    Returns:
        The remaining tokens joined by single spaces.
    """
    # NOTE(review): stop-word removal presumably also removes negations that a
    # word-level lexicon cannot see anyway — confirm this is intended.
    punctuation_table = str.maketrans('', '', string.punctuation)
    without_punctuation = text.lower().translate(punctuation_table)
    # Runs of digits are deleted outright (not replaced by a space)
    without_digits = re.sub(r'\d+', '', without_punctuation)
    kept_tokens = (tok for tok in word_tokenize(without_digits) if tok not in dutch_stopwords)
    return ' '.join(kept_tokens)

# Read the tab-separated lexicon (one "word<TAB>score" pair per line) into a
# DataFrame, skipping lines with incorrect formatting.
# Rows are collected in a list and the frame is built once: growing a
# DataFrame with pd.concat inside the loop copies it on every line and leaves
# every row with index 0.
lexicon_rows = []

# NOTE(review): assumes LUPJE.txt is UTF-8 encoded — confirm against the file.
with open("LUPJE.txt", "r", encoding="utf-8") as file:
    for line in file:
        try:
            # Exactly two tab-separated fields are required; anything else raises ValueError
            word, sentiment_score = line.strip().split("\t")
        except ValueError:
            print(f"Skipping line with incorrect formatting: {line.strip()}")
            continue
        lexicon_rows.append({"word": word, "sentiment_score": sentiment_score})

words_sentiment_df = pd.DataFrame(lexicon_rows, columns=["word", "sentiment_score"])

# Convert sentiment scores to numeric type
words_sentiment_df["sentiment_score"] = pd.to_numeric(words_sentiment_df["sentiment_score"])

# Function that performs the sentiment analysis
def analyze_sentiment(text, lexicon=None):
    """Classify ``text`` by summing the lexicon scores of its tokens.

    Parameters
    ----------
    text : str
        Raw text; it is cleaned with ``preprocess_text`` before scoring.
    lexicon : mapping of str to number, optional
        Word -> score lookup. When omitted it is derived from the
        module-level ``words_sentiment_df``; pass a prebuilt mapping to avoid
        rebuilding it on every call.

    Returns
    -------
    str
        "positive" if the summed score is above 0, "negative" if below 0,
        otherwise "neutral" (including texts with no lexicon words).
    """
    if lexicon is None:
        # One hash lookup table instead of scanning the whole frame twice per
        # token. drop_duplicates keeps the first row of a repeated word, which
        # matches taking the first match in the frame.
        lexicon = (
            words_sentiment_df.drop_duplicates(subset="word")
            .set_index("word")["sentiment_score"]
            .to_dict()
        )

    # Preprocess the text and split it back into tokens
    tokens = preprocess_text(text).split()

    # Calculate sentiment score based on words in the text
    sentiment_score = sum(lexicon[token] for token in tokens if token in lexicon)

    # Determine sentiment label based on sentiment score
    if sentiment_score > 0:
        return "positive"
    elif sentiment_score < 0:
        return "negative"
    else:
        return "neutral"

# Perform sentiment analysis on the dataset
def perform_sentiment_analysis(dataset):
    """Add true and predicted sentiment columns to ``dataset`` and return it.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Needs a "labels" column (0 = negative, 1 = neutral, 2 = positive)
        and a "text" column. The frame is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``dataset`` object with two added columns:
        "true_sentiment" and "predicted_sentiment".
    """
    # Map numerical labels to sentiment categories through the shared helper,
    # so an unexpected label becomes "unknown" rather than NaN.
    dataset["true_sentiment"] = dataset["labels"].map(map_labels)

    # Apply sentiment analysis to the text column
    dataset["predicted_sentiment"] = dataset["text"].apply(analyze_sentiment)
    return dataset

# Score one dataset and summarise how well the predictions match the labels
def generate_classification_report(dataset):
    """Run the lexicon classifier on ``dataset`` and return its classification report.

    ``dataset`` is passed to ``perform_sentiment_analysis``; the resulting
    "true_sentiment" and "predicted_sentiment" columns are compared with
    ``classification_report``, whose output is returned.
    """
    scored = perform_sentiment_analysis(dataset)
    y_true = scored["true_sentiment"]
    y_pred = scored["predicted_sentiment"]
    return classification_report(y_true, y_pred)

# CSV files to evaluate, one per decade
dataset_paths = [
    "1960s_gas.csv",
    "1970s_gas.csv",
    "1980s_gas.csv",
    "1990s_gas.csv",
]

# Evaluate each file in turn and print its classification report
for dataset_path in dataset_paths:
    # The report title is the file name up to its first dot
    dataset_name = dataset_path.partition(".")[0]
    your_dataset = pd.read_csv(dataset_path)
    print(f"Classification Report for {dataset_name}:")
    report = generate_classification_report(your_dataset)
    print(report, "-" * 50, sep="\n")  # Report followed by a separator line


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Classification Report for 1960s_gas:
              precision    recall  f1-score   support

    negative       0.21      0.26      0.23        82
     neutral       0.42      0.08      0.14       131
    positive       0.53      0.74      0.62       220

    accuracy                           0.45       433
   macro avg       0.39      0.36      0.33       433
weighted avg       0.44      0.45      0.40       433

--------------------------------------------------
Classification Report for 1970s_gas:
              precision    recall  f1-score   support

    negative       0.14      0.21      0.17        19
     neutral       0.00      0.00      0.00        22
    positive       0.50      0.60      0.55        55

    accuracy                           0.39        96
   macro avg       0.21      0.27      0.24        96
weighted avg       0.31      0.39      0.35        96

--------------------------------------------------
Classification Report for 1980s_gas:
              precision  