# Model training for mlops

In [0]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.metrics import f1_score, accuracy_score
import mlflow
from mlflow.models import infer_signature
import mlflow.spark

In [0]:
df = spark.read.table(
    "mlopsig2i.default.tweets_silver"
)
pdf = df.toPandas()
display(df)

count,hate_speech,offensive_language,neither,class,tweet,text,text_len,label
3,0,0,3,2,!!! RT @mayasolovely: As a woman you shouldn't complain about cleaning up your house. & as a man you should always take the trash out...,!!! : as a woman you shouldn't complain about cleaning up your house. & as a man you should always take the trash out...,125,2.0
3,0,3,0,1,!!!!! RT @mleew17: boy dats cold...tyga dwn bad for cuffin dat hoe in the 1st place!!,!!!!! : boy dats cold...tyga dwn bad for cuffin dat hoe in the 1st place!!,75,1.0
3,0,3,0,1,!!!!!!! RT @UrKindOfBrand Dawg!!!! RT @80sbaby4life: You ever fuck a bitch and she start to cry? You be confused as shit,!!!!!!! dawg!!!! : you ever fuck a bitch and she start to cry? you be confused as shit,89,1.0
3,0,2,1,1,!!!!!!!!! RT @C_G_Anderson: @viva_based she look like a tranny,!!!!!!!!! : she look like a tranny,36,1.0
6,0,6,0,1,!!!!!!!!!!!!! RT @ShenikaRoberts: The shit you hear about me might be true or it might be faker than the bitch who told it to ya ,!!!!!!!!!!!!! : the shit you hear about me might be true or it might be faker than the bitch who told it to ya ,120,1.0
3,1,2,0,1,"!!!!!!!!!!!!!!!!!!""@T_Madison_x: The shit just blows me..claim you so faithful and down for somebody but still fucking with hoes! 😂😂😂""","!!!!!!!!!!!!!!!!!!"": the shit just blows me..claim you so faithful and down for somebody but still fucking with hoes! 😂😂😂""",146,1.0
3,0,3,0,1,"!!!!!!""@__BrighterDays: I can not just sit up and HATE on another bitch .. I got too much shit going on!""","!!!!!!"": i can not just sit up and hate on another bitch .. i got too much shit going on!""",90,1.0
3,0,3,0,1,!!!!“@selfiequeenbri: cause I'm tired of you big bitches coming for us skinny girls!!”,!!!!“: cause i'm tired of you big bitches coming for us skinny girls!!”,83,1.0
3,0,3,0,1,""" & you might not get ya bitch back & thats that """,""" & you might not get ya bitch back & thats that """,58,1.0
3,1,2,0,1,""" @rhythmixx_ :hobbies include: fighting Mariam""",""" :hobbies include: fighting mariam""",38,1.0


In [0]:
pdf["class"].value_counts()

class
1    19190
2     4163
0     1430
Name: count, dtype: int64

In [0]:
min_count = pdf["label"].value_counts().min()
balanced_pdf = pd.concat([
    pdf[pdf["label"] == 0].sample(n=min_count, random_state=42),
    pdf[pdf["label"] == 1].sample(n=min_count, random_state=42),
    pdf[pdf["label"] == 2].sample(n=min_count, random_state=42)
])
balanced_pdf = balanced_pdf.sample(frac=1, random_state=42).reset_index(drop=True)

In [0]:
balanced_pdf["class"].value_counts()

class
2    1430
1    1430
0    1430
Name: count, dtype: int64

In [0]:
# Prepare pipeline
pipeline = Pipeline([
    ("vect", CountVectorizer(stop_words="english")),
    ("tfidf", TfidfTransformer()),
    ("clf", LogisticRegression(max_iter=30))
])

In [0]:
# Split data into train and test sets
X = pdf["text"]
y = pdf["label"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [0]:
print(X_train[0])

!!!  : as a woman you shouldn't complain about cleaning up your house. &amp; as a man you should always take the trash out...


In [0]:
y_train[0]

np.float64(2.0)

In [0]:
X_train

15272     :  well how else will white ppl get us to for...
9351     funny thing is....it's not just the people doi...
20323     : ": ": nigga messed with the wrong bitch &#1...
3638                                    bitch ass nigggaaa
20579                                  s/o that real bitch
                               ...                        
21575    the last at-bat at yankee stadium. thanks for ...
5390      lmfaoooo yooo i lost my elevator pass &#12855...
860      #porn,#android,#iphone,#ipad,#sex,#xxx, | #ana...
15795     : just when i thought justin bieber couldn't ...
23654    bitches ain&#8217;t shit, and they ain&#8217;t...
Name: text, Length: 19826, dtype: object

In [0]:
y_train

15272    0.0
9351     2.0
20323    1.0
3638     1.0
20579    1.0
        ... 
21575    2.0
5390     1.0
860      1.0
15795    1.0
23654    1.0
Name: label, Length: 19826, dtype: float64

In [0]:
# Fit on train data
model = pipeline.fit(X_train, y_train)

STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [0]:
# Cellule 1 : Après l'entraînement du modèle, ajouter le wrapper
import mlflow.pyfunc
import pandas as pd

class TextClassifierWrapper(mlflow.pyfunc.PythonModel):
    def __init__(self, sklearn_model):
        self.model = sklearn_model
    
    def predict(self, context, model_input):
        # Extraire le texte du DataFrame
        if isinstance(model_input, pd.DataFrame):
            if "text" in model_input.columns:
                texts = model_input["text"].tolist()
            else:
                texts = model_input.iloc[:, 0].tolist()
        else:
            texts = model_input
        
        return self.model.predict(texts)

# Créer le wrapper avec ton modèle entraîné
wrapped_model = TextClassifierWrapper(model)



In [0]:
# Cellule 2 : Logger le modèle wrappé dans MLflow
import mlflow
from mlflow.models.signature import ModelSignature
from mlflow.types.schema import Schema, ColSpec

# Définir la signature
input_schema = Schema([ColSpec("string", "text")])
output_schema = Schema([ColSpec("integer")])
signature = ModelSignature(inputs=input_schema, outputs=output_schema)

# Logger le modèle wrappé
with mlflow.start_run():
    mlflow.pyfunc.log_model(
        artifact_path="model",
        python_model=wrapped_model,
        registered_model_name="mlopsig2i.default.mlops_model",
        signature=signature,
        input_example=pd.DataFrame({"text": ["example text"]})
    )
    print("✅ Modèle wrappé enregistré dans MLflow!")

2025/12/17 10:28:56 INFO mlflow.pyfunc: Validating input example against model signature
Registered model 'mlopsig2i.default.mlops_model' already exists. Creating a new version of this model...


✅ Modèle wrappé enregistré dans MLflow!


Created version '10' of model 'mlopsig2i.default.mlops_model'.


In [0]:
text_model.predict(["je suis gentil"])

array([2.])

In [0]:
text_model.predict("je suis gentil")

np.float64(2.0)

In [0]:
X_test.iloc[4]

INFO:py4j.clientserver:Received command c on object id p0


"niggas cheat on they bitch and don't expect no pay back whatsoever. yall just as naive as these bitches."

In [0]:
X_test.iloc[5] = "je suis gentil"
X_test.iloc[5]

'je suis gentil'