<a href="https://colab.research.google.com/github/HiveCase/MachineLearningPractice/blob/main/Week8/MLP_GA8_1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [11]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

In [12]:
# Load the processed tweet dataset from the Colab working directory and
# preview the first rows.
DATA_PATH = '/content/processed_data.csv'
df = pd.read_csv(DATA_PATH)
df.head()

Unnamed: 0,text,sentiment,Time of Tweet,Age of User,Land Area (Km²),Continent,Density_Level,Population_Group
0,Sooo SAD I will miss you here in San Diego!!!,negative,noon,21-30,27400.0,EU,Medium,Medium
1,my boss is bullying me...,negative,night,31-45,2381740.0,AF,Low,Medium
2,what interview! leave me alone,negative,morning,46-60,470.0,EU,Medium,Low
3,"Sons of ****, why couldn`t they put them on t...",negative,noon,60-70,1246700.0,AF,Low,Medium
4,2am feedings for the baby are fun when he is a...,positive,morning,0-20,2736690.0,SA,Low,Medium


In [13]:
# Hold out 20% of the rows as the test set; the fixed random_state keeps the
# split identical across runs so the answers below are reproducible.
train_set, test_set = train_test_split(df, test_size=0.2, random_state=0)

In [14]:
# Build the training vocabulary: concatenate every training tweet, lowercase
# the result, split on whitespace and keep the distinct tokens.
joined_tweets = ' '.join(train_set['text'])
all_text = joined_tweets.lower()
words = all_text.split()
unique_words = {token for token in words}

In [15]:
# Number of distinct lowercase, whitespace-separated tokens in the training text.
len(unique_words)

26614

In [16]:
from sklearn.preprocessing import StandardScaler, OrdinalEncoder, OneHotEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

In [19]:
# Column groups, one per kind of preprocessing.
text_feature = 'text'
num_feature = ['Land Area (Km²)']
nom_features = ['Age of User', 'Time of Tweet', 'Continent']
ord_features = ['Density_Level', 'Population_Group']

# Category order for each ordinal column, listed in the same order as
# ord_features.
ordinal_mapping = [
    ['Low', 'Medium', 'High'],
    ['Low', 'Medium', 'High'],
]

# TF-IDF settings: unigrams and bigrams, capped at 5000 features. The token
# pattern accepts words of two or more word characters, or an @/# followed by
# word characters.
tfidf_options = dict(
    lowercase=True,
    stop_words='english',
    max_features=5000,
    ngram_range=(1, 2),
    token_pattern=r'(?u)\b\w\w+\b|[@#]\w+',
    strip_accents='unicode',
)

# One transformer per column group.
num_transformer = StandardScaler()
ord_transformer = OrdinalEncoder(categories=ordinal_mapping)
nom_transformer = OneHotEncoder(sparse_output=False, drop='first')
text_transformer = TfidfVectorizer(**tfidf_options)

# The order of the steps fixes the column order of the output matrix.
column_steps = [
    ('num', num_transformer, num_feature),
    ('ord', ord_transformer, ord_features),
    ('nom', nom_transformer, nom_features),
    ('text', text_transformer, text_feature),
]
preprocessor = ColumnTransformer(transformers=column_steps, remainder='drop')

# Fit on the training split only, then apply the fitted transform to the
# test split.
X_train = preprocessor.fit_transform(train_set)
X_test = preprocessor.transform(test_set)

# Sum of every value in the first five transformed test rows, reported to
# two decimal places.
first_five_rows = X_test[:5]
result = np.sum(first_five_rows)
print("Answer:", round(result, 2))


Answer: 26.89


In [20]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import log_loss
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.compose import ColumnTransformer

# Feature groups for the Naive Bayes model. 'Land Area (Km²)' is deliberately
# left out of this feature set.
ord_features = ['Density_Level', 'Population_Group']
nom_features = ['Age of User', 'Time of Tweet', 'Continent']
text_feature = 'text'

# Category order for each ordinal column, in the same order as ord_features.
ordinal_mapping = [['Low', 'Medium', 'High'], ['Low', 'Medium', 'High']]

# Fresh transformer instances for this pipeline.
ord_transformer = OrdinalEncoder(categories=ordinal_mapping)
nom_transformer = OneHotEncoder(sparse_output=False, drop='first')
text_transformer = TfidfVectorizer(
    lowercase=True,
    stop_words='english',
    max_features=5000,
    ngram_range=(1, 2),
    token_pattern=r'(?u)\b\w\w+\b|[@#]\w+',
    strip_accents='unicode'
)

# ColumnTransformer without the numerical feature. remainder='drop' is spelled
# out so that this matches the first preprocessor and makes it explicit that
# unlisted columns are discarded.
preprocessor_nb = ColumnTransformer(
    transformers=[
        ('ord', ord_transformer, ord_features),
        ('nom', nom_transformer, nom_features),
        ('text', text_transformer, text_feature)
    ],
    remainder='drop'
)

# Fit on the training split only, then transform both splits.
X_train_nb = preprocessor_nb.fit_transform(train_set)
X_test_nb = preprocessor_nb.transform(test_set)
y_train = train_set['sentiment']
y_test = test_set['sentiment']

# Train the MultinomialNB model.
model = MultinomialNB()
model.fit(X_train_nb, y_train)

# Predict class probabilities and compute the log loss. Passing
# labels=model.classes_ ties the probability columns to the classes the model
# was trained on, instead of letting log_loss infer the label set from y_test
# (which fails or misaligns if a class is absent from the test split).
y_proba = model.predict_proba(X_test_nb)
loss = log_loss(y_test, y_proba, labels=model.classes_)

# Output the result.
print("Log Loss:", round(loss, 2))


Log Loss: 0.37


In [21]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix
import pandas as pd

# Train the Random Forest on the full preprocessed feature matrix.
# X_train / X_test come from the `preprocessor` cell; y_train / y_test are the
# 'sentiment' columns assigned in the Naive Bayes cell, so both cells must
# have been run before this one.
rf = RandomForestClassifier(random_state=42)
rf.fit(X_train, y_train)

# Predict on the test set.
y_pred = rf.predict(X_test)

# Confusion matrix with an explicit label order so that row/column 0 is
# 'positive' and row/column 1 is 'negative'.
conf_matrix = confusion_matrix(y_test, y_pred, labels=['positive', 'negative'])
conf_df = pd.DataFrame(conf_matrix, index=['Actual_Pos', 'Actual_Neg'], columns=['Pred_Pos', 'Pred_Neg'])

# Display confusion matrix.
print("Confusion Matrix:")
print(conf_df)

# Error analysis: positives predicted as negative vs. negatives predicted as
# positive.
false_negative = conf_df.loc['Actual_Pos', 'Pred_Neg']
false_positive = conf_df.loc['Actual_Neg', 'Pred_Pos']

print("\nFalse Negatives:", false_negative)
print("False Positives:", false_positive)

# Report which class is misclassified more often. A tie is reported as a tie
# rather than being attributed to the negative class.
if false_negative > false_positive:
    print("\n👉 Model is more confused about the **Positive** class.")
elif false_negative < false_positive:
    print("\n👉 Model is more confused about the **Negative** class.")
else:
    print("\n👉 Model is equally confused about both classes.")


Confusion Matrix:
            Pred_Pos  Pred_Neg
Actual_Pos      1474       267
Actual_Neg       184      1348

False Negatives: 267
False Positives: 184

👉 Model is more confused about the **Positive** class.


In [22]:
from sklearn.feature_selection import RFECV
from sklearn.linear_model import LogisticRegression

# Base estimator wrapped by the recursive feature elimination.
estimator = LogisticRegression(max_iter=1000, random_state=42)

# Cross-validated recursive feature elimination: 5 folds, step=100, using
# all available cores. fit() returns the fitted selector itself.
selector = RFECV(
    estimator=estimator,
    step=100,
    cv=5,
    n_jobs=-1,
).fit(X_train, y_train)

# support_ is a boolean mask over the input columns; its sum is the number of
# features that were kept.
support_mask = selector.support_
selected_features = support_mask.sum()
print("Number of selected features:", selected_features)


Number of selected features: 4216
