In [None]:
import pandas as pd
import re
import spacy
import random
import numpy as np
!pip install langdetect
from langdetect import detect
import langdetect

Collecting langdetect
  Downloading langdetect-1.0.9.tar.gz (981 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/981.5 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m184.3/981.5 kB[0m [31m5.3 MB/s[0m eta [36m0:00:01[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━[0m [32m901.1/981.5 kB[0m [31m13.4 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m981.5/981.5 kB[0m [31m10.4 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: langdetect
  Building wheel for langdetect (setup.py) ... [?25l[?25hdone
  Created wheel for langdetect: filename=langdetect-1.0.9-py3-none-any.whl size=993222 sha256=fcdd8fe4aa6aa7d8ea9b7e1738417706a5f0f56bf1809ed3e7187c9f07e9e2e3
  Stored in directory: /root/.cache/pip/wheels/0a/f2/b2/e5ca405801e05eb7c8ed5b3

In [None]:
# Mount Google Drive into the Colab filesystem at /content/drive/.
# NOTE(review): the CSV loaded later is read from the relative path "output.csv",
# not from under /content/drive/ — confirm whether this mount is still required.
from google.colab import drive
drive.mount('/content/drive/')

Mounted at /content/drive/


In [None]:
# Load the raw reviews and reduce them to the two columns used downstream.
SAMPLE_SIZE = 30000  # upper bound on the number of reviews kept

df = pd.read_csv("output.csv")
# Sample at most SAMPLE_SIZE rows: min() avoids the ValueError pandas raises when
# asked for more rows than the frame holds. The fixed seed keeps the subset stable.
df = df.sample(n=min(SAMPLE_SIZE, len(df)), random_state=42)
df = df.rename(columns={'content': 'review', 'is_positive': 'recommended'})
# Convert the textual label to a boolean: True only for the exact string 'Positive'.
df['recommended'] = df['recommended'] == 'Positive'
# Keep only the columns we are interested in.
df = df[['review', 'recommended']]

# Print the class balance of the sample.
print(df['recommended'].value_counts())



recommended
True     15367
False    14633
Name: count, dtype: int64


In [None]:
# next filter by english only
# langdetect is randomised by default, so short or ambiguous texts can be
# classified differently from run to run. Fixing the seed makes the English
# filter (and therefore every row count below) reproducible.
langdetect.DetectorFactory.seed = 0

def is_english(text):
    """Return True when langdetect classifies ``text`` as English ('en').

    Returns False instead of raising when detection fails with
    LangDetectException or when ``text`` triggers a TypeError (presumably
    non-string values such as NaN — confirm against the data).
    """
    try:
        return detect(text) == 'en'
    except (langdetect.lang_detect_exception.LangDetectException, TypeError):
        return False

# Flag English reviews, keep only those rows, then discard the helper flag.
df = (
    df.assign(is_english=df['review'].apply(is_english))
      .loc[lambda frame: frame['is_english']]
      .drop(columns='is_english')
)

# Show the class counts after filtering to check the classes are still balanced.
print(df['recommended'].value_counts())

recommended
False    10755
True     10392
Name: count, dtype: int64


In [None]:
# count presence of all caps and exclamation marks

def count_exclamation_marks(text):
    """Return how many '!' characters ``text`` contains (0 for missing values)."""
    return 0 if pd.isna(text) else text.count('!')

def count_caps_words(text):
    """Count the all-uppercase alphabetic words of length >= 2 in ``text``.

    Missing values count as 0. Only words made purely of ASCII letters are
    considered, so single capitals such as "I" or "A" are not counted.
    """
    if pd.isna(text):
        return 0
    shouted = [
        candidate
        for candidate in re.findall(r'\b[A-Za-z]+\b', text)
        if len(candidate) >= 2 and candidate.isupper()
    ]
    return len(shouted)

# Attach the two surface-level emphasis counts as new feature columns.
for column_name, counter in (
    ('exclamation_count', count_exclamation_marks),
    ('all_caps_count', count_caps_words),
):
    df[column_name] = df['review'].apply(counter)

df.head()

Unnamed: 0,review,recommended,exclamation_count,all_caps_count
95810,"Goodbye, I will miss you!",True,1,0
66240,pretty shit tbh for the sole reason that no on...,True,0,0
152217,It's genuinely so impressive how well this gam...,True,0,0
11713,It's Counterstrike therefore it sucks.,False,0,0
50076,Casual is literally unplayable at all times of...,False,0,0


In [None]:
# tokenize and lemmatize
nlp = spacy.load("en_core_web_sm")

def lemmatize_text(text):
    """Return the space-joined lemmas of ``text`` with punctuation tokens removed.

    Missing values yield an empty string. Calling ``nlp(text)`` already
    tokenizes the text, so no separate tokenization step is needed.
    """
    if pd.isna(text):
        return ""
    lemmas = []
    for token in nlp(text):
        # Punctuation is dropped in the same pass that collects the lemmas.
        if token.is_punct:
            continue
        lemmas.append(token.lemma_)
    return " ".join(lemmas)

# Lemmatize every review into a new column; the raw 'review' text is left as-is.
df['lemmatized_review'] = df['review'].apply(lemmatize_text)



In [None]:
# count profanity using lemmatized text

profanity_list = ['shit', 'fuck', 'crap']

def count_profanity(text):
    """Count whole-word occurrences of the words in ``profanity_list``.

    Matching is case-insensitive: the lemmatized text keeps the original
    casing of many tokens, so "SHIT" or "Crap" must count as well.

    Returns 0 for missing values and when ``profanity_list`` is empty.
    """
    if pd.isna(text):
        return 0
    if not profanity_list:
        # An empty alternation would match at every word boundary.
        return 0
    # Built per call so later edits to profanity_list are picked up.
    pattern = r'\b(?:' + '|'.join(re.escape(word) for word in profanity_list) + r')\b'
    return len(re.findall(pattern, text, flags=re.IGNORECASE))

df['profanity_counter'] = df['lemmatized_review'].apply(count_profanity)
df.head()

Unnamed: 0,review,recommended,exclamation_count,all_caps_count,lemmatized_review,profanity_counter
95810,"Goodbye, I will miss you!",True,1,0,goodbye I will miss you,0
66240,pretty shit tbh for the sole reason that no on...,True,0,0,pretty shit tbh for the sole reason that no on...,1
152217,It's genuinely so impressive how well this gam...,True,0,0,it be genuinely so impressive how well this ga...,0
11713,It's Counterstrike therefore it sucks.,False,0,0,it be Counterstrike therefore it suck,0
50076,Casual is literally unplayable at all times of...,False,0,0,Casual be literally unplayable at all time of ...,0


In [None]:
# Hand-assigned sentiment weights per marker phrase: negative values for markers
# assumed to signal a negative review, positive values for the opposite.
# Keys are lowercase; the code that consumes this dict lowercases the text
# before matching.
# NOTE(review): matching is done on the lemmatized text, where inflected forms
# may have been replaced by their lemma — confirm that multi-word keys such as
# "slightly better" still occur after lemmatization.
discourse_markers = {
    # Contrast/Concession Markers (typically signal negative)
    "nevertheless": -0.7,
    "nonetheless": -0.7,
    "however": -0.6,
    "despite": -0.5,
    "although": -0.5,
    "yet": -0.6,
    "even though": -0.5,
    "in spite of": -0.5,
    "but": -0.4,


    # Additive/Amplification Markers (typically signal positive)
    "moreover": 0.6,
    "furthermore": 0.6,
    "in addition": 0.5,
    # NOTE(review): matched as one contiguous phrase, so the usual
    # "not only X but also Y" construction will not match this key.
    "not only but also": 0.7,
    "indeed": 0.7,
    "plus": 0.5,

    # Resultative Markers (can go either way, but let's say slightly positive default)
    "therefore": 0.3,
    "consequently": 0.3,
    "as a result": 0.2,
    "thus": 0.3,

    # Temporal Progression Markers (slightly negative default)
    "initially": -0.2,
    "at first": -0.3,
    "after a while": -0.5,
    "eventually": -0.4,
    "over time": -0.3,

    # Other types of markers:
    "on the other hand": -0.5,
    "to be honest": -0.5,
    "fortunately": 0.7,
    "unfortunately": -0.8,
    "slightly better": -0.2,
    "without a doubt": 0.3,

    # Not discourse markers, but common sentiment-bearing words:
    "trash": -0.9,
    "fun": 0.5,

}

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC


class ItemSelector(BaseEstimator, TransformerMixin):
    """Stateless transformer that returns ``data[self.key]`` from its input.

    Used as the first step of a sub-pipeline so that the following step
    receives a single item (e.g. one column) instead of the whole input.
    """
    def __init__(self, key):
        # key: whatever is passed to ``data[...]`` in transform (e.g. a column name).
        self.key = key

    def fit(self, x, y=None):
        """Nothing is learned; returns self."""
        return self

    def transform(self, data):
        """Return ``data[self.key]``."""
        return data[self.key]

class NumericalFeaturesTransformer(BaseEstimator, TransformerMixin):
    """Stateless transformer that extracts the given columns as an array.

    Parameters
    ----------
    feature_names : list of str
        Columns to select from the input in ``transform``.
    """
    def __init__(self, feature_names):
        self.feature_names = feature_names

    def fit(self, X, y=None):
        """Nothing is learned; returns self."""
        return self

    def transform(self, X):
        """Return ``X[self.feature_names]`` as a NumPy array."""
        return X[self.feature_names].to_numpy()

    def get_feature_names_out(self, input_features=None):
        """Return the selected column names.

        ``input_features`` is accepted (and ignored) because scikit-learn's
        Pipeline and FeatureUnion pass it when collecting feature names;
        without the parameter that call fails with a TypeError.
        """
        return self.feature_names

class DiscourseMarkerTransformer(BaseEstimator, TransformerMixin):
    """Turn texts into per-marker counts plus one weighted-sum column.

    Parameters
    ----------
    markers_dict : dict
        Maps each marker phrase to its weight. The iteration order of the
        dict fixes the column order of the output.
    """
    def __init__(self, markers_dict):
        self.markers_dict = markers_dict

    def fit(self, X, y=None):
        """Nothing is learned; returns self."""
        return self

    def transform(self, X):
        """Return an array of shape ``(len(X), len(markers_dict) + 1)``.

        Column ``j`` holds the whole-word count of the j-th marker in the
        lowercased text; the last column holds the sum of count * weight
        over all markers. Rows for missing texts stay all zeros.
        """
        # Compile every marker pattern once per call rather than rebuilding
        # it for each (text, marker) pair inside the row loop.
        compiled_markers = [
            (re.compile(r'\b' + re.escape(marker) + r'\b'), weight)
            for marker, weight in self.markers_dict.items()
        ]

        features = np.zeros((len(X), len(compiled_markers) + 1))  # +1 for weighted sum

        for i, text in enumerate(X):
            if pd.isna(text):
                continue

            text = text.lower()
            weighted_sum = 0

            for j, (pattern, weight) in enumerate(compiled_markers):
                count = len(pattern.findall(text))
                features[i, j] = count
                weighted_sum += count * weight

            features[i, -1] = weighted_sum

        return features

def build_sentiment_classifier():
    """Assemble the unfitted feature-union + linear SVM pipeline.

    Three feature branches are concatenated, in this order:
    TF-IDF uni/bi-grams of the lemmatized text, discourse-marker counts of the
    lemmatized text, and the three numeric count columns. The result feeds a
    linear-kernel SVC with balanced class weights.
    """
    text_column = 'lemmatized_review'
    numeric_columns = ['profanity_counter', 'exclamation_count', 'all_caps_count']

    feature_union = FeatureUnion([
        ('text', Pipeline([
            ('selector', ItemSelector(text_column)),
            ('tfidf', TfidfVectorizer(max_features=1000, min_df=5, ngram_range=(1, 2))),
        ])),
        ('discourse', Pipeline([
            ('selector', ItemSelector(text_column)),
            ('discourse', DiscourseMarkerTransformer(discourse_markers)),
        ])),
        ('numerical', Pipeline([
            ('numerical', NumericalFeaturesTransformer(numeric_columns)),
        ])),
    ])

    svm = SVC(kernel='linear', C=1.0, probability=True, class_weight='balanced')

    return Pipeline([
        ('features', feature_union),
        ('classifier', svm),
    ])

def add_marker_columns(df, text_column='lemmatized_review', markers=None):
    """Add one count column per marker plus ``marker_weighted_sum`` to ``df``.

    Every marker, single- or multi-word, is counted as a whole-word match in
    the lowercased text. Plain substring counting would count "what first"
    as an occurrence of "at first".

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to extend; it is modified in place and also returned.
    text_column : str
        Column holding the text to search. Missing values count as 0.
    markers : dict, optional
        Maps marker phrase to weight. Defaults to the module-level
        ``discourse_markers``.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` with ``marker_<phrase>`` columns (spaces replaced by
        underscores) and ``marker_weighted_sum`` (sum of count * weight).
    """
    if markers is None:
        markers = discourse_markers

    def _column_for(marker):
        # Column name used for a marker's count.
        return f'marker_{marker.replace(" ", "_")}'

    # Lowercase once instead of once per marker; missing values stay missing.
    lowered = df[text_column].apply(lambda text: text if pd.isna(text) else text.lower())

    for marker in markers:
        pattern = re.compile(r'\b' + re.escape(marker) + r'\b')
        df[_column_for(marker)] = lowered.apply(
            lambda text: 0 if pd.isna(text) else len(pattern.findall(text))
        )

    df['marker_weighted_sum'] = 0
    for marker, weight in markers.items():
        df['marker_weighted_sum'] += df[_column_for(marker)] * weight

    return df

In [None]:
df = add_marker_columns(df)

# Model inputs: the lemmatized text plus the three numeric count columns.
feature_columns = ['lemmatized_review', 'profanity_counter', 'exclamation_count', 'all_caps_count']
X, y = df[feature_columns], df['recommended']

# Hold out 30% of the rows for evaluation; the seed keeps the split fixed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

model = build_sentiment_classifier()
model.fit(X_train, y_train)

# Evaluate on the held-out rows.
y_pred = model.predict(X_test)
print(classification_report(y_test, y_pred))



              precision    recall  f1-score   support

       False       0.85      0.81      0.83      3192
        True       0.82      0.85      0.83      3153

    accuracy                           0.83      6345
   macro avg       0.83      0.83      0.83      6345
weighted avg       0.83      0.83      0.83      6345



In [None]:
# Check if markers exist in dataset and have variance
# Split the frame by label once, then compare the per-class means of each count.
recommended_rows = df[df['recommended'] == True]
not_recommended_rows = df[df['recommended'] == False]

positive_avg = recommended_rows['profanity_counter'].mean()
negative_avg = not_recommended_rows['profanity_counter'].mean()
print(f"Average profanity in positive reviews: {positive_avg:.3f}")
print(f"Average profanity in negative reviews: {negative_avg:.3f}")

caps_positive = recommended_rows['all_caps_count'].mean()
caps_negative = not_recommended_rows['all_caps_count'].mean()

print(f"Average # of all caps in positive reviews: {caps_positive:.3f}")
print(f"Average # of all caps in negative reviews: {caps_negative:.3f}")

exclamation_positive = recommended_rows['exclamation_count'].mean()
exclamation_negative = not_recommended_rows['exclamation_count'].mean()

print(f"Average # of exclamation marks in positive reviews: {exclamation_positive:.3f}")
print(f"Average # of exclamation marks in negative reviews: {exclamation_negative:.3f}")

# Per-marker totals and their correlation with the (0/1) label.
marker_stats = {}
for marker in discourse_markers.keys():
    column_name = f'marker_{marker.replace(" ", "_")}'

    # Skip markers whose count column was never created.
    if column_name not in df.columns:
        print(f"Warning: Column {column_name} does not exist")
        continue

    # Total number of occurrences across all reviews.
    count = df[column_name].sum()

    if count > 0:
        # Correlation is only computed when the marker appears at least once.
        correlation = df[column_name].corr(df['recommended'].astype(int))
        stats_entry = {
            'count': count,
            'frequency': count / len(df),
            'correlation': correlation,
        }
    else:
        stats_entry = {
            'count': 0,
            'frequency': 0,
            'correlation': None,
        }
    stats_entry['manual_weight'] = discourse_markers[marker]
    marker_stats[marker] = stats_entry

# Display results
print("\nMarker Statistics:")
for marker, stats in marker_stats.items():
    if not stats['count'] > 0:
        print(f"{marker}: does not appear in dataset")
        continue
    print(f"{marker}: appears {stats['count']} times ({stats['frequency']:.1%}), correlation: {stats['correlation']:.3f}, manual weight: {stats['manual_weight']:.1f}")

Average profanity in positive reviews: 0.019
Average profanity in negative reviews: 0.104
Average # of all caps in positive reviews: 0.492
Average # of all caps in negative reviews: 1.013
Average # of exclamation marks in positive reviews: 0.322
Average # of exclamation marks in negative reviews: 0.325

Marker Statistics:
nevertheless: appears 11 times (0.1%), correlation: 0.011, manual weight: -0.7
nonetheless: appears 12 times (0.1%), correlation: 0.004, manual weight: -0.7
however: appears 316 times (1.5%), correlation: -0.012, manual weight: -0.6
despite: appears 176 times (0.8%), correlation: -0.007, manual weight: -0.5
although: appears 154 times (0.7%), correlation: 0.023, manual weight: -0.5
yet: appears 286 times (1.4%), correlation: -0.021, manual weight: -0.6
even though: appears 129 times (0.6%), correlation: -0.012, manual weight: -0.5
in spite of: appears 7 times (0.0%), correlation: -0.013, manual weight: -0.5
but: appears 6273 times (29.7%), correlation: -0.027, manual 