In [43]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split

import nltk
import re
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.model_selection import ParameterGrid

from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.naive_bayes import MultinomialNB
from xgboost import XGBClassifier

from sklearn.model_selection import GridSearchCV
import mlflow
import dagshub


In [2]:
data_url = r'https://raw.githubusercontent.com/campusx-official/jupyter-masterclass/main/tweet_emotions.csv'

df = pd.read_csv(data_url)

df.head()

Unnamed: 0,tweet_id,sentiment,content
0,1956967341,empty,@tiffanylue i know i was listenin to bad habi...
1,1956967666,sadness,Layin n bed with a headache ughhhh...waitin o...
2,1956967696,sadness,Funeral ceremony...gloomy friday...
3,1956967789,enthusiasm,wants to hang out with friends SOON!
4,1956968416,neutral,@dannycastillo We want to trade with someone w...


In [3]:
# drop the tweet id column from the data

df.drop(columns=['tweet_id'],inplace=True)

df.columns


Index(['sentiment', 'content'], dtype='object')

In [4]:
# check for missing values

df.isna().sum()

sentiment    0
content      0
dtype: int64

In [5]:
# text preprocessing on data

nltk.download('wordnet')
nltk.download('stopwords')


def lemmatization(text):
    lemmatizer= WordNetLemmatizer()

    text = text.split()

    text=[lemmatizer.lemmatize(y) for y in text]

    return " " .join(text)


def remove_stop_words(text):
    stop_words = set(stopwords.words("english"))
    Text=[i for i in str(text).split() if i not in stop_words]
    return " ".join(Text)


def removing_numbers(text):
    text=''.join([i for i in text if not i.isdigit()])
    return text


def lower_case(text):

    text = text.split()

    text=[y.lower() for y in text]

    return " " .join(text)


def removing_punctuations(text):
    ## Remove punctuations
    text = re.sub('[%s]' % re.escape("""!"#$%&'()*+,،-./:;<=>؟?@[\]^_`{|}~"""), ' ', text)
    text = text.replace('؛',"", )

    ## remove extra whitespace
    text = re.sub('\s+', ' ', text)
    text =  " ".join(text.split())
    return text.strip()


def removing_urls(text):
    url_pattern = re.compile(r'https?://\S+|www\.\S+')
    return url_pattern.sub(r'', text)


def remove_small_sentences(df):
    for i in range(len(df)):
        if len(df.text.iloc[i].split()) < 3:
            df.text.iloc[i] = np.nan


def normalize_text(df):
    df.loc[:,'content'] = df.loc[:,'content'].apply(lambda x : lower_case(x))
    df.loc[:,'content'] = df.loc[:,'content'].apply(lambda x : remove_stop_words(x))
    df.loc[:,'content'] = df.loc[:,'content'].apply(lambda x : removing_numbers(x))
    df.loc[:,'content'] = df.loc[:,'content'].apply(lambda x : removing_punctuations(x))
    df.loc[:,'content'] = df.loc[:,'content'].apply(lambda x : removing_urls(x))
    df.loc[:,'content'] = df.loc[:,'content'].apply(lambda x : lemmatization(x))
    return df

  text = re.sub('[%s]' % re.escape("""!"#$%&'()*+,،-./:;<=>؟?@[\]^_`{|}~"""), ' ', text)
  text = re.sub('\s+', ' ', text)
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\himan\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\himan\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [6]:
# normalize the df

df_processed = normalize_text(df)

df_processed.head()

Unnamed: 0,sentiment,content
0,empty,tiffanylue know listenin bad habit earlier sta...
1,sadness,layin n bed headache ughhhh waitin call
2,sadness,funeral ceremony gloomy friday
3,enthusiasm,want hang friend soon
4,neutral,dannycastillo want trade someone houston ticke...


In [7]:
# check missing values in the processed data

df_processed.isna().sum()

sentiment    0
content      0
dtype: int64

In [7]:
# value counts for target feature

# classes_to_filter = (
#     df_processed
#     .loc[:,'sentiment']
#     .value_counts(normalize=True)
#     .loc[lambda ser: ser.ge(0.2)]
#     .index
#     .tolist()
# )

# classes_to_filter

['neutral', 'worry']

In [10]:
# value counts of the target
(
    df_processed['sentiment']
    .value_counts()
    .index[2:4]
    .tolist()
)

['happiness', 'sadness']

In [11]:
# classes to filter

classes_to_filter = ['happiness', 'sadness']

In [12]:
df_processed = (
                    df_processed
                    .loc[df_processed['sentiment'].isin(classes_to_filter),:]
                )

In [13]:
# make X and y

X = df_processed.drop(columns=['sentiment']).squeeze()
y = df_processed['sentiment']

X

1                  layin n bed headache ughhhh waitin call
2                           funeral ceremony gloomy friday
6        sleep im not thinking old friend want he s mar...
8                             charviray charlene love miss
9                          kelcouch i m sorry least friday
                               ...                        
39986                going watch boy striped pj s hope cry
39987    gave bike thorough wash degrease grease it thi...
39988             amazing time last night mcfly incredible
39994                          succesfully following tayla
39998    niariley wassup beautiful follow me peep new h...
Name: content, Length: 10374, dtype: object

In [14]:
y.value_counts()

sentiment
happiness    5209
sadness      5165
Name: count, dtype: int64

In [15]:
# train test split the data

X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2,random_state=42)

print('The shape of X_train is ',X_train.shape)
print('The shape of X_test is ',X_test.shape)

The shape of X_train is  (8299,)
The shape of X_test is  (2075,)


In [16]:
# check for missing values in X_train

X_train.isna().sum()

0

In [17]:
# check for missing values in X_test

X_test.isna().sum()

0

In [18]:
# label encode the target

le = LabelEncoder()

# fit transform the y train
y_train_trans = le.fit_transform(y_train)

# transform the y test
y_test_trans = le.transform(y_test)

In [19]:
y_train_trans

array([1, 1, 1, ..., 0, 1, 1])

In [20]:
# class encodings

le.classes_

array(['happiness', 'sadness'], dtype=object)

In [21]:
bow = CountVectorizer(max_features=1000)

log_reg = LogisticRegression()

log_reg.fit(bow.fit_transform(X_train),y_train_trans)

In [22]:
# model pipeline

model_pipe = Pipeline(steps=[
    ('preprocess',CountVectorizer()),
    ('clf',LogisticRegression())
])

model_pipe

In [23]:
# params grid to test the base estimators

params_grid = {
    'preprocess': [CountVectorizer(max_features=15000),TfidfVectorizer(max_features=15000)],
    'clf': [RandomForestClassifier(),
            LogisticRegression(),
            GradientBoostingClassifier(),
            XGBClassifier(),
            MultinomialNB()]
}

In [24]:
X_train.shape

(8299,)

In [36]:
y_train

23531      sadness
8051       sadness
11499      sadness
31288    happiness
18561      sadness
           ...    
21697    happiness
19445      sadness
20216    happiness
3258       sadness
27810      sadness
Name: sentiment, Length: 8299, dtype: object

In [25]:
y_train_trans.shape

(8299,)

In [26]:
# make the grid search object

grid_search = GridSearchCV(model_pipe,params_grid,
                           scoring=['accuracy','f1'],cv=3,
                           n_jobs=-1,verbose=3,
                           return_train_score=True,
                           refit='f1')

# fit the grid search object
grid_search.fit(X_train,y_train_trans)

# set the cv results
results = grid_search.cv_results_

Fitting 3 folds for each of 10 candidates, totalling 30 fits


In [27]:
# keys of the grid search results

results.keys()

dict_keys(['mean_fit_time', 'std_fit_time', 'mean_score_time', 'std_score_time', 'param_clf', 'param_preprocess', 'params', 'split0_test_accuracy', 'split1_test_accuracy', 'split2_test_accuracy', 'mean_test_accuracy', 'std_test_accuracy', 'rank_test_accuracy', 'split0_train_accuracy', 'split1_train_accuracy', 'split2_train_accuracy', 'mean_train_accuracy', 'std_train_accuracy', 'split0_test_f1', 'split1_test_f1', 'split2_test_f1', 'mean_test_f1', 'std_test_f1', 'rank_test_f1', 'split0_train_f1', 'split1_train_f1', 'split2_train_f1', 'mean_train_f1', 'std_train_f1'])

In [33]:
# train scores for the results

results['mean_train_accuracy']

array([0.99975902, 0.99975902, 0.96089898, 0.91143543, 0.76159815,
       0.76575526, 0.83823423, 0.8686592 , 0.94047485, 0.94011324])

In [29]:
# mean test scores from grid search

results['mean_test_accuracy']

array([0.7684071 , 0.76937115, 0.78708338, 0.7866013 , 0.7304509 ,
       0.72996924, 0.76780446, 0.75708112, 0.77768401, 0.77756406])

In [30]:
results['mean_test_f1']

array([0.76983937, 0.76773815, 0.78565234, 0.78342724, 0.75874218,
       0.75984497, 0.7586014 , 0.7461784 , 0.77788799, 0.77721375])

In [None]:
results['params'][0]['clf'].__class__.__name__

'RandomForestClassifier'

In [32]:
results['params']

[{'clf': RandomForestClassifier(),
  'preprocess': CountVectorizer(max_features=15000)},
 {'clf': RandomForestClassifier(),
  'preprocess': TfidfVectorizer(max_features=15000)},
 {'clf': LogisticRegression(),
  'preprocess': CountVectorizer(max_features=15000)},
 {'clf': LogisticRegression(),
  'preprocess': TfidfVectorizer(max_features=15000)},
 {'clf': GradientBoostingClassifier(),
  'preprocess': CountVectorizer(max_features=15000)},
 {'clf': GradientBoostingClassifier(),
  'preprocess': TfidfVectorizer(max_features=15000)},
 {'clf': XGBClassifier(base_score=None, booster=None, callbacks=None,
                colsample_bylevel=None, colsample_bynode=None,
                colsample_bytree=None, device=None, early_stopping_rounds=None,
                enable_categorical=False, eval_metric=None, feature_types=None,
                gamma=None, grow_policy=None, importance_type=None,
                interaction_constraints=None, learning_rate=None, max_bin=None,
                max_cat_t

In [34]:
grid_search.best_score_

0.7856523447417731

In [35]:
grid_search.best_params_

{'clf': LogisticRegression(),
 'preprocess': CountVectorizer(max_features=15000)}

In [41]:
# get the train and test df

train_df, test_df = train_test_split(df_processed,test_size=0.2,random_state=42)

mlflow.data.from_pandas(train_df)

train_df

Unnamed: 0,sentiment,content
23531,sadness,quot my problem miss you cause don t quot
8051,sadness,that s it done already one proof there s nothi...
11499,sadness,hungry food steal
31288,happiness,foot hurt finally bed will forget crunch over ...
18561,sadness,really ill atm
...,...,...
21697,happiness,chocolatesuze yes yes should especially wine m...
19445,sadness,kickzfadayz boy better get tonight
20216,happiness,tafe actually quite good
3258,sadness,minute boarding hour home window seat


## Experimentation Tracking for Base Estimators

In [47]:
X_test.shape

(2075,)

In [46]:
# get the predictions on test data

y_pred = grid_search.best_estimator_.predict(X_test)

y_pred

array([1, 1, 1, ..., 0, 1, 0])

In [49]:
# set the tracking uri
mlflow.set_tracking_uri("https://dagshub.com/himanshu1703/mlops-mini-project.mlflow")

# initialize dagshub
dagshub.init(repo_owner='himanshu1703', repo_name='mlops-mini-project', mlflow=True)

# set the experiment name
mlflow.set_experiment("Baseline Models")

# start the mlflow tracking
with mlflow.start_run(run_name='best_base_model') as parent:
    
    for ind in range(len(ParameterGrid(params_grid))):
        # log the child runs
        # create run name
        run_name = f"{results['params'][ind]['preprocess'].__class__.__name__} with {results['params'][ind]['clf'].__class__.__name__}"
        
        with mlflow.start_run(run_name=run_name,nested=True) as child:
            # log the parameters
            mlflow.log_param("Preprocessor", results['params'][ind]['preprocess'].__class__.__name__)
            mlflow.log_param("Classifier", results['params'][ind]['clf'].__class__.__name__)
            
            train_metrics = {
                'train_accuracy': results["mean_train_accuracy"][ind],
                "train_f1": results["mean_train_f1"][ind]
            }
            
            test_metrics = {
                'test_accuracy': results["mean_test_accuracy"][ind],
                "test_f1": results["mean_test_f1"][ind]
            }
            
            # log the train metrics
            mlflow.log_metrics(train_metrics)
            
            # log the test metrics
            mlflow.log_metrics(test_metrics)
    
    # best model
    best_model = grid_search.best_estimator_
    
    # best score
    best_model_score = grid_search.best_score_
    
    # best parameters
    best_params = grid_search.best_params_
    
    # log the training data
    mlflow.log_input(mlflow.data.from_pandas(train_df),context='training')
    
    # log the test data
    mlflow.log_input(mlflow.data.from_pandas(test_df),context='validation')
    
    # get the model signature
    signature = mlflow.models.infer_signature(model_input=X_train,model_output=best_model.predict(X_test)) 
           
    # log the best model
    mlflow.sklearn.log_model(sk_model=best_model,
                             artifact_path= "best_baseline_model",
                             signature=signature)
    
    # log the metrics
    best_metrics = {
        'accuracy': accuracy_score(y_test_trans,y_pred),
        'f1': best_model_score
    }
    
    mlflow.log_metrics(best_metrics)
    
    # log the best parameters
    mlflow.log_params(best_params)