In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split

import nltk
import re
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import ParameterGrid

from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score

from sklearn.linear_model import LogisticRegression

from sklearn.model_selection import GridSearchCV
import mlflow
import dagshub


In [3]:
data_url = r'https://raw.githubusercontent.com/campusx-official/jupyter-masterclass/main/tweet_emotions.csv'

df = pd.read_csv(data_url)

df.head()

Unnamed: 0,tweet_id,sentiment,content
0,1956967341,empty,@tiffanylue i know i was listenin to bad habi...
1,1956967666,sadness,Layin n bed with a headache ughhhh...waitin o...
2,1956967696,sadness,Funeral ceremony...gloomy friday...
3,1956967789,enthusiasm,wants to hang out with friends SOON!
4,1956968416,neutral,@dannycastillo We want to trade with someone w...


In [4]:
# drop the tweet id column from the data

df.drop(columns=['tweet_id'],inplace=True)

df.columns


Index(['sentiment', 'content'], dtype='object')

In [5]:
# text preprocessing on data

nltk.download('wordnet')
nltk.download('stopwords')


def lemmatization(text):
    lemmatizer= WordNetLemmatizer()

    text = text.split()

    text=[lemmatizer.lemmatize(y) for y in text]

    return " " .join(text)


def remove_stop_words(text):
    stop_words = set(stopwords.words("english"))
    Text=[i for i in str(text).split() if i not in stop_words]
    return " ".join(Text)


def removing_numbers(text):
    text=''.join([i for i in text if not i.isdigit()])
    return text


def lower_case(text):

    text = text.split()

    text=[y.lower() for y in text]

    return " " .join(text)


def removing_punctuations(text):
    ## Remove punctuations
    text = re.sub('[%s]' % re.escape("""!"#$%&'()*+,،-./:;<=>؟?@[\]^_`{|}~"""), ' ', text)
    text = text.replace('؛',"", )

    ## remove extra whitespace
    text = re.sub('\s+', ' ', text)
    text =  " ".join(text.split())
    return text.strip()


def removing_urls(text):
    url_pattern = re.compile(r'https?://\S+|www\.\S+')
    return url_pattern.sub(r'', text)


def remove_small_sentences(df):
    for i in range(len(df)):
        if len(df.text.iloc[i].split()) < 3:
            df.text.iloc[i] = np.nan


def normalize_text(df):
    df.loc[:,'content'] = df.loc[:,'content'].apply(lambda x : lower_case(x))
    df.loc[:,'content'] = df.loc[:,'content'].apply(lambda x : remove_stop_words(x))
    df.loc[:,'content'] = df.loc[:,'content'].apply(lambda x : removing_numbers(x))
    df.loc[:,'content'] = df.loc[:,'content'].apply(lambda x : removing_punctuations(x))
    df.loc[:,'content'] = df.loc[:,'content'].apply(lambda x : removing_urls(x))
    df.loc[:,'content'] = df.loc[:,'content'].apply(lambda x : lemmatization(x))
    return df

  text = re.sub('[%s]' % re.escape("""!"#$%&'()*+,،-./:;<=>؟?@[\]^_`{|}~"""), ' ', text)
  text = re.sub('\s+', ' ', text)
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\himan\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\himan\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [6]:
# normalize the df

df_processed = normalize_text(df)

df_processed.head()

Unnamed: 0,sentiment,content
0,empty,tiffanylue know listenin bad habit earlier sta...
1,sadness,layin n bed headache ughhhh waitin call
2,sadness,funeral ceremony gloomy friday
3,enthusiasm,want hang friend soon
4,neutral,dannycastillo want trade someone houston ticke...


In [7]:
# check missing values in the processed data

df_processed.isna().sum()

sentiment    0
content      0
dtype: int64

In [8]:
# classes to filter

classes_to_filter = ['happiness', 'sadness']

In [9]:
df_processed = (
                    df_processed
                    .loc[df_processed['sentiment'].isin(classes_to_filter),:]
                )

In [10]:
# make X and y

X = df_processed.drop(columns=['sentiment']).squeeze()
y = df_processed['sentiment']

X

1                  layin n bed headache ughhhh waitin call
2                           funeral ceremony gloomy friday
6        sleep im not thinking old friend want he s mar...
8                             charviray charlene love miss
9                          kelcouch i m sorry least friday
                               ...                        
39986                going watch boy striped pj s hope cry
39987    gave bike thorough wash degrease grease it thi...
39988             amazing time last night mcfly incredible
39994                          succesfully following tayla
39998    niariley wassup beautiful follow me peep new h...
Name: content, Length: 10374, dtype: object

In [11]:
y.value_counts()

sentiment
happiness    5209
sadness      5165
Name: count, dtype: int64

In [12]:
# train test split the data

X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2,random_state=42)

print('The shape of X_train is ',X_train.shape)
print('The shape of X_test is ',X_test.shape)

The shape of X_train is  (8299,)
The shape of X_test is  (2075,)


In [13]:
# check for missing values in X_train

X_train.isna().sum()

0

In [14]:
# label encode the target

le = LabelEncoder()

# fit transform the y train
y_train_trans = le.fit_transform(y_train)

# transform the y test
y_test_trans = le.transform(y_test)

In [15]:
y_train_trans

array([1, 1, 1, ..., 0, 1, 1])

In [45]:
# model pipeline

model_pipe = Pipeline(steps=[
    ('preprocess',CountVectorizer()),
    ('clf',LogisticRegression())
])

model_pipe

In [46]:
# params grid to test the base estimators

params_grid = {
    "preprocess__max_features" : [None, 20000, 15000, 10000],
    "clf__C": list(np.round(np.arange(0.1,2.1,0.1),2)),
    "clf__penalty": ['l1','l2'],
    "clf__solver": ['liblinear','saga']
}

In [48]:
# make the grid search object

grid_search = GridSearchCV(model_pipe,params_grid,
                           scoring=['accuracy','f1'],cv=5,
                           n_jobs=-1,verbose=2,
                           return_train_score=True,
                           refit='f1')

# fit the grid search object
grid_search.fit(X_train,y_train_trans)

Fitting 5 folds for each of 320 candidates, totalling 1600 fits


In [49]:
# set the cv results
results = grid_search.cv_results_

In [50]:
# get the best estimator

best_model = grid_search.best_estimator_


In [51]:
# get the best parameters

best_params = grid_search.best_params_

best_params

{'clf__C': 0.6,
 'clf__penalty': 'l2',
 'clf__solver': 'liblinear',
 'preprocess__max_features': None}

In [52]:
# get the best score

best_score = grid_search.best_score_

best_score

0.7888419442017314

In [53]:
# get the train and test df

train_df, test_df = train_test_split(df_processed,test_size=0.2,random_state=42)

mlflow.data.from_pandas(train_df)

train_df

Unnamed: 0,sentiment,content
23531,sadness,quot my problem miss you cause don t quot
8051,sadness,that s it done already one proof there s nothi...
11499,sadness,hungry food steal
31288,happiness,foot hurt finally bed will forget crunch over ...
18561,sadness,really ill atm
...,...,...
21697,happiness,chocolatesuze yes yes should especially wine m...
19445,sadness,kickzfadayz boy better get tonight
20216,happiness,tafe actually quite good
3258,sadness,minute boarding hour home window seat


In [54]:
# get the predictions on test data

y_pred = grid_search.best_estimator_.predict(X_test)

y_pred

array([1, 1, 1, ..., 0, 1, 0])

In [56]:
# set the tracking uri
mlflow.set_tracking_uri("https://dagshub.com/himanshu1703/mlops-mini-project.mlflow")

# initialize dagshub
dagshub.init(repo_owner='himanshu1703', repo_name='mlops-mini-project', mlflow=True)

# set the experiment name
mlflow.set_experiment("HyperParameter Tuning")

# start the mlflow tracking
with mlflow.start_run(run_name='best_base_model') as parent:
    
    for ind in range(len(ParameterGrid(params_grid))):
        # log the child runs
        
        with mlflow.start_run(nested=True) as child:
            # log the parameters
            mlflow.log_params(results['params'][ind])
            
            train_metrics = {
                'train_accuracy': results["mean_train_accuracy"][ind],
                "train_f1": results["mean_train_f1"][ind]
            }
            
            test_metrics = {
                'test_accuracy': results["mean_test_accuracy"][ind],
                "test_f1": results["mean_test_f1"][ind]
            }
            
            # log the train metrics
            mlflow.log_metrics(train_metrics)
            
            # log the test metrics
            mlflow.log_metrics(test_metrics)
    
    
    # log the training data
    mlflow.log_input(mlflow.data.from_pandas(train_df),context='training')
    
    # log the test data
    mlflow.log_input(mlflow.data.from_pandas(test_df),context='validation')
    
    # get the model signature
    signature = mlflow.models.infer_signature(model_input=X_train,
                                              model_output=best_model.predict(X_test)) 
           
    # log the best model
    mlflow.sklearn.log_model(sk_model=best_model,
                             artifact_path= "best_baseline_model",
                             signature=signature)
    
    # log the metrics
    best_metrics = {
        'accuracy': accuracy_score(y_test_trans,y_pred),
        'f1': best_score
    }
    
    mlflow.log_metrics(best_metrics)
    
    # log the best parameters
    mlflow.log_params(best_params)