<a href="https://colab.research.google.com/github/KartikayBhardwaj-dev/COLLEGE/blob/main/week8_graded_1_1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [66]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import re
from sklearn.preprocessing import StandardScaler, OrdinalEncoder, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import log_loss
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix
from sklearn.feature_selection import RFECV
from sklearn.linear_model import LogisticRegression


In [None]:
# Location of the pre-processed tweet dataset inside the Colab runtime.
DATA_PATH = '/content/processed_data.csv'
df = pd.read_csv(DATA_PATH)

In [None]:
# Print the column names, non-null counts and dtypes of the loaded frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16363 entries, 0 to 16362
Data columns (total 8 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   text              16363 non-null  object 
 1   sentiment         16363 non-null  object 
 2   Time of Tweet     16363 non-null  object 
 3   Age of User       16363 non-null  object 
 4   Land Area (Km²)   16363 non-null  float64
 5   Continent         14416 non-null  object 
 6   Density_Level     16363 non-null  object 
 7   Population_Group  16363 non-null  object 
dtypes: float64(1), object(7)
memory usage: 1022.8+ KB


In [None]:
# Hold out 20% of the rows as the test split; the fixed seed keeps the split reproducible.
X_train_df, X_test_df = train_test_split(
    df,
    test_size=0.2,
    random_state=0,
)

In [None]:
# Count the distinct lower-cased word tokens in the training tweets.
# The raw training frame (X_train_df) is used here: X_train is only assigned
# further down and holds the transformed feature matrix, not the tweet text.
all_text = " ".join(X_train_df['text']).lower()
words = re.findall(r'\b\w+\b', all_text)
unique_words = set(words)
print(len(unique_words))

16446


In [None]:
# Distinct raw tweet texts in the training split. The text column lives in
# X_train_df (the raw frame), not in the transformed matrix X_train.
X_train_df['text'].unique()

array([' come home, then. Not so boring here.',
       'Is feeling like he has a bad flu. Yes. Bad. Flu.',
       ' have a lok at EF too! they are jummy', ...,
       'Happy Mothers Day! Will be going out later at 6 pm to watch a well renowned group of singers!',
       'ummm sooo yeh....its really hard to concentrate rite now wen i have this weird #lupus feeling goin thro my body',
       'Good morning Tweeple of the sun! What you all up to?'],
      dtype=object)

PREPROCESSING
- Apply preprocessing to the features of both the training and test datasets as follows:

- For numerical Feature (Land Area (Km²))
  * scale using StandardScaler.

- For ordinal features (Density_Level, Population_Group)
  * Apply OrdinalEncoder with the following category mapping: "Low" → 0, "Medium" → 1, "High" → 2.

- For nominal features (Age of User, Time of Tweet, Continent)
  * Use OneHotEncoder with `sparse_output=False` and `drop='first'`

- For text features (text)
  * Apply TfidfVectorizer with the following parameters:

    * `lowercase=True`
    * `stop_words='english'`
    * `max_features=5000`
    * `ngram_range=(1, 2)`
    * `token_pattern=r'(?u)\b\w\w+\b|[@#]\w+'` (to include hashtags and mentions)
    * `strip_accents='unicode'` (to normalize characters like "é")

In [None]:
# Shared category order for both ordinal columns: "Low" -> 0, "Medium" -> 1, "High" -> 2.
level_order = ["Low", "Medium", "High"]

# TF-IDF over unigrams and bigrams; the token pattern also matches @mentions and #hashtags.
tfidf = TfidfVectorizer(
    lowercase=True,
    stop_words='english',
    max_features=5000,
    ngram_range=(1, 2),
    token_pattern=r'(?u)\b\w\w+\b|[@#]\w+',
    strip_accents='unicode',
)

# One transformer per feature family: scaled numeric, ordinal, one-hot nominal and text.
ct = ColumnTransformer(
    transformers=[
        ('numerical', StandardScaler(), ['Land Area (Km²)']),
        ('ordinal', OrdinalEncoder(categories=[level_order, level_order]), ['Density_Level', 'Population_Group']),
        ('nominal', OneHotEncoder(sparse_output=False, drop='first'), ['Age of User', 'Time of Tweet', 'Continent']),
        ('text', tfidf, 'text'),
    ]
)

In [None]:
# Learn the preprocessing on the training split only, then apply the fitted
# transformers unchanged to the test split.
X_train = ct.fit_transform(X_train_df)
X_test = ct.transform(X_test_df)

In [None]:
# Sum of every value in the first five transformed test rows (quick check).
X_test[:5].sum()

np.float64(26.885773869406624)

MODEL BUILDING

In [None]:
# Feature groups for the Naive Bayes model. 'Land Area (Km²)' is left out here
# (presumably because MultinomialNB expects non-negative inputs and the
# standard-scaled column can be negative -- confirm against the assignment).
ord_features = ['Density_Level', 'Population_Group']
nom_features = ['Age of User', 'Time of Tweet', 'Continent']
text_feature = 'text'

# Ordinal encoding order for both ordinal columns: Low -> 0, Medium -> 1, High -> 2
ordinal_mapping = [['Low', 'Medium', 'High'], ['Low', 'Medium', 'High']]

# Define transformers
ord_transformer = OrdinalEncoder(categories=ordinal_mapping)
nom_transformer = OneHotEncoder(sparse_output=False, drop='first')
text_transformer = TfidfVectorizer(
    lowercase=True,
    stop_words='english',
    max_features=5000,
    ngram_range=(1, 2),
    token_pattern=r'(?u)\b\w\w+\b|[@#]\w+',
    strip_accents='unicode'
)

# ColumnTransformer excluding the numerical feature
preprocessor_nb = ColumnTransformer(
    transformers=[
        ('ord', ord_transformer, ord_features),
        ('nom', nom_transformer, nom_features),
        ('text', text_transformer, text_feature)
    ]
)

# Fit the preprocessing on the training split and reuse it on the test split;
# the 'sentiment' column is the target.
X_train_nb = preprocessor_nb.fit_transform(X_train_df)
X_test_nb = preprocessor_nb.transform(X_test_df)
y_train = X_train_df['sentiment']
y_test = X_test_df['sentiment']

# Train the MultinomialNB model
model = MultinomialNB()
model.fit(X_train_nb, y_train)

# Predict class probabilities and compute the log loss.
# predict_proba orders its columns by model.classes_; passing the same labels
# keeps log_loss aligned with those columns even if a class is missing from y_test.
y_proba = model.predict_proba(X_test_nb)
loss = log_loss(y_test, y_proba, labels=model.classes_)

# Output the result
print("Log Loss:", round(loss, 2))

Log Loss: 0.37


ERROR ANALYSIS
- Train a RandomForestClassifier with random_state=42 on the preprocessed training dataset. This time, include all features, including Land Area (Km²), in both the training and test datasets.

In [None]:
# Feature groups for the full model -- this time INCLUDING 'Land Area (Km²)'
num_feature = ['Land Area (Km²)']
ord_features = ['Density_Level', 'Population_Group']
nom_features = ['Age of User', 'Time of Tweet', 'Continent']
text_feature = 'text'

# Ordinal encoding order for both ordinal columns: Low -> 0, Medium -> 1, High -> 2
ordinal_mapping = [['Low', 'Medium', 'High'], ['Low', 'Medium', 'High']]

# Define transformers
num_transformer = StandardScaler()
ord_transformer = OrdinalEncoder(categories=ordinal_mapping)
nom_transformer = OneHotEncoder(sparse_output=False, drop='first')
text_transformer = TfidfVectorizer(
    lowercase=True,
    stop_words='english',
    max_features=5000,
    ngram_range=(1, 2),
    token_pattern=r'(?u)\b\w\w+\b|[@#]\w+',
    strip_accents='unicode'
)

# ColumnTransformer covering the numerical, ordinal, nominal and text features
preprocessor = ColumnTransformer(
    transformers=[
        ('num', num_transformer, num_feature),
        ('ord', ord_transformer, ord_features),
        ('nom', nom_transformer, nom_features),
        ('text', text_transformer, text_feature)
    ]
)
# Fit on the training split, then apply the fitted transformers to the test split.
# NOTE: this rebinds X_train / X_test assigned by the earlier `ct` cell.
X_train = preprocessor.fit_transform(X_train_df)
X_test = preprocessor.transform(X_test_df)
# 'sentiment' is the target column.
y_train = X_train_df['sentiment']
y_test = X_test_df['sentiment']

In [64]:
# Fit a random forest (fixed seed) on the preprocessed training matrix and
# predict labels for the test matrix.
model2 = RandomForestClassifier(random_state=42)
model2.fit(X_train, y_train)
y_pred = model2.predict(X_test)

# Confusion matrix with rows = actual class and columns = predicted class,
# ordered positive first, then negative.
label_order = ['positive', 'negative']
conf_matrix = confusion_matrix(y_test, y_pred, labels=label_order)
conf_df = pd.DataFrame(
    conf_matrix,
    index=['Actual_Pos', 'Actual_Neg'],
    columns=['Pred_Pos', 'Pred_Neg'],
)

# Display confusion matrix
print("Confusion Matrix:")
print(conf_df)


Confusion Matrix:
            Pred_Pos  Pred_Neg
Actual_Pos      1474       267
Actual_Neg       184      1348


In [68]:
# Recursive feature elimination with 5-fold cross-validation, removing 100
# features per step, with logistic regression as the ranking estimator.
estimator = LogisticRegression(random_state=42, max_iter=1000)
selector = RFECV(
    estimator=estimator,
    step=100,
    cv=5,
    n_jobs=-1,
)
selector.fit(X_train, y_train)

# support_ is a boolean mask over the features; its sum is the number kept.
selected_features = selector.support_.sum()

In [69]:
# Number of features retained by the RFECV selector.
selected_features

np.int64(4216)