# MLOps Pipeline using Apache Airflow

In [52]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler

### load data

In [15]:
data = pd.read_csv("screentime_analysis.csv")

In [17]:
data.head()

Unnamed: 0,Date,App,Usage (minutes),Notifications,Times Opened
0,2024-08-07,Instagram,81,24,57
1,2024-08-08,Instagram,90,30,53
2,2024-08-26,Instagram,112,33,17
3,2024-08-22,Instagram,82,11,38
4,2024-08-12,Instagram,59,47,16


In [22]:
print(data['App'].value_counts())

App
Instagram      25
X              25
WhatsApp       25
8 Ball Pool    25
Safari         25
Netflix        25
Facebook       25
LinkedIn       25
Name: count, dtype: int64


In [14]:
print(data.isnull().sum())
print("-------")
print(data.duplicated().sum())

Date               0
App                0
Usage (minutes)    0
Notifications      0
Times Opened       0
dtype: int64
-------
0


In [24]:
data['Date'] = pd.to_datetime(data['Date'])
data['Date']

0     2024-08-07
1     2024-08-08
2     2024-08-26
3     2024-08-22
4     2024-08-12
         ...    
195   2024-08-10
196   2024-08-23
197   2024-08-18
198   2024-08-26
199   2024-08-02
Name: Date, Length: 200, dtype: datetime64[ns]

In [27]:
data['DayofWeek'] = data['Date'].dt.dayofweek
print(data['DayofWeek'])

0      2
1      3
2      0
3      3
4      0
      ..
195    5
196    4
197    6
198    0
199    4
Name: DayofWeek, Length: 200, dtype: int32


In [30]:
data['Month'] = data['Date'].dt.month
print(data['Month'])
print(data)

0      8
1      8
2      8
3      8
4      8
      ..
195    8
196    8
197    8
198    8
199    8
Name: Month, Length: 200, dtype: int32
          Date        App  Usage (minutes)  Notifications  Times Opened  \
0   2024-08-07  Instagram               81             24            57   
1   2024-08-08  Instagram               90             30            53   
2   2024-08-26  Instagram              112             33            17   
3   2024-08-22  Instagram               82             11            38   
4   2024-08-12  Instagram               59             47            16   
..         ...        ...              ...            ...           ...   
195 2024-08-10   LinkedIn               22             12             5   
196 2024-08-23   LinkedIn                5              7             1   
197 2024-08-18   LinkedIn               19              2             5   
198 2024-08-26   LinkedIn               21             14             1   
199 2024-08-02   LinkedIn            

In [38]:
#convert numerical to categorical features

In [37]:
data = pd.get_dummies(data, columns=['App'], drop_first=True)


KeyError: "None of [Index(['App'], dtype='object')] are in the [columns]"

In [40]:
data

Unnamed: 0,Date,Usage (minutes),Notifications,Times Opened,DayofWeek,Month,App_Facebook,App_Instagram,App_LinkedIn,App_Netflix,App_Safari,App_WhatsApp,App_X
0,2024-08-07,81,24,57,2,8,False,True,False,False,False,False,False
1,2024-08-08,90,30,53,3,8,False,True,False,False,False,False,False
2,2024-08-26,112,33,17,0,8,False,True,False,False,False,False,False
3,2024-08-22,82,11,38,3,8,False,True,False,False,False,False,False
4,2024-08-12,59,47,16,0,8,False,True,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...
195,2024-08-10,22,12,5,5,8,False,False,True,False,False,False,False
196,2024-08-23,5,7,1,4,8,False,False,True,False,False,False,False
197,2024-08-18,19,2,5,6,8,False,False,True,False,False,False,False
198,2024-08-26,21,14,1,0,8,False,False,True,False,False,False,False


In [43]:
scaler = MinMaxScaler()
data[['Notifications', 'Times Opened']] = scaler.fit_transform(data[['Notifications', 'Times Opened']] )

In [44]:
data

Unnamed: 0,Date,Usage (minutes),Notifications,Times Opened,DayofWeek,Month,App_Facebook,App_Instagram,App_LinkedIn,App_Netflix,App_Safari,App_WhatsApp,App_X
0,2024-08-07,81,0.163265,0.571429,2,8,False,True,False,False,False,False,False
1,2024-08-08,90,0.204082,0.530612,3,8,False,True,False,False,False,False,False
2,2024-08-26,112,0.224490,0.163265,0,8,False,True,False,False,False,False,False
3,2024-08-22,82,0.074830,0.377551,3,8,False,True,False,False,False,False,False
4,2024-08-12,59,0.319728,0.153061,0,8,False,True,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...
195,2024-08-10,22,0.081633,0.040816,5,8,False,False,True,False,False,False,False
196,2024-08-23,5,0.047619,0.000000,4,8,False,False,True,False,False,False,False
197,2024-08-18,19,0.013605,0.040816,6,8,False,False,True,False,False,False,False
198,2024-08-26,21,0.095238,0.000000,0,8,False,False,True,False,False,False,False


In [45]:
data['Previous_Day_Usage'] = data['Usage (minutes)'].shift(1)

In [46]:
data

Unnamed: 0,Date,Usage (minutes),Notifications,Times Opened,DayofWeek,Month,App_Facebook,App_Instagram,App_LinkedIn,App_Netflix,App_Safari,App_WhatsApp,App_X,Previous_Day_Usage
0,2024-08-07,81,0.163265,0.571429,2,8,False,True,False,False,False,False,False,
1,2024-08-08,90,0.204082,0.530612,3,8,False,True,False,False,False,False,False,81.0
2,2024-08-26,112,0.224490,0.163265,0,8,False,True,False,False,False,False,False,90.0
3,2024-08-22,82,0.074830,0.377551,3,8,False,True,False,False,False,False,False,112.0
4,2024-08-12,59,0.319728,0.153061,0,8,False,True,False,False,False,False,False,82.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
195,2024-08-10,22,0.081633,0.040816,5,8,False,False,True,False,False,False,False,19.0
196,2024-08-23,5,0.047619,0.000000,4,8,False,False,True,False,False,False,False,22.0
197,2024-08-18,19,0.013605,0.040816,6,8,False,False,True,False,False,False,False,5.0
198,2024-08-26,21,0.095238,0.000000,0,8,False,False,True,False,False,False,False,19.0


In [47]:
data['Notifications_x_TimesOpened'] = data['Notifications'] * data['Times Opened']
data

Unnamed: 0,Date,Usage (minutes),Notifications,Times Opened,DayofWeek,Month,App_Facebook,App_Instagram,App_LinkedIn,App_Netflix,App_Safari,App_WhatsApp,App_X,Previous_Day_Usage,Notifications_x_TimesOpened
0,2024-08-07,81,0.163265,0.571429,2,8,False,True,False,False,False,False,False,,0.093294
1,2024-08-08,90,0.204082,0.530612,3,8,False,True,False,False,False,False,False,81.0,0.108288
2,2024-08-26,112,0.224490,0.163265,0,8,False,True,False,False,False,False,False,90.0,0.036651
3,2024-08-22,82,0.074830,0.377551,3,8,False,True,False,False,False,False,False,112.0,0.028252
4,2024-08-12,59,0.319728,0.153061,0,8,False,True,False,False,False,False,False,82.0,0.048938
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
195,2024-08-10,22,0.081633,0.040816,5,8,False,False,True,False,False,False,False,19.0,0.003332
196,2024-08-23,5,0.047619,0.000000,4,8,False,False,True,False,False,False,False,22.0,0.000000
197,2024-08-18,19,0.013605,0.040816,6,8,False,False,True,False,False,False,False,5.0,0.000555
198,2024-08-26,21,0.095238,0.000000,0,8,False,False,True,False,False,False,False,19.0,0.000000


In [49]:
data.isnull().sum()

Date                           0
Usage (minutes)                0
Notifications                  0
Times Opened                   0
DayofWeek                      0
Month                          0
App_Facebook                   0
App_Instagram                  0
App_LinkedIn                   0
App_Netflix                    0
App_Safari                     0
App_WhatsApp                   0
App_X                          0
Previous_Day_Usage             1
Notifications_x_TimesOpened    0
dtype: int64

In [50]:
##Save the new modified csv

In [51]:
data.to_csv('preprocessed_screentime_analysis.csv',index=False)

# Training Model

In [53]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error

In [57]:
#split data into featiures and target variable
X = data.drop(columns=['Usage (minutes)','Date'])
print(X.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200 entries, 0 to 199
Data columns (total 13 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   Notifications                200 non-null    float64
 1   Times Opened                 200 non-null    float64
 2   DayofWeek                    200 non-null    int32  
 3   Month                        200 non-null    int32  
 4   App_Facebook                 200 non-null    bool   
 5   App_Instagram                200 non-null    bool   
 6   App_LinkedIn                 200 non-null    bool   
 7   App_Netflix                  200 non-null    bool   
 8   App_Safari                   200 non-null    bool   
 9   App_WhatsApp                 200 non-null    bool   
 10  App_X                        200 non-null    bool   
 11  Previous_Day_Usage           199 non-null    float64
 12  Notifications_x_TimesOpened  200 non-null    float64
dtypes: bool(7), float64(

In [58]:
y = data['Usage (minutes)']

In [59]:
#train-test split
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2,random_state=42)

In [60]:
#model Initialization
model = RandomForestRegressor(random_state=42)

In [63]:
model.fit(X_train,y_train)

In [64]:
#model evalution
pred = model.predict(X_test)

In [71]:
mae = mean_absolute_error(y_test, pred)
print(f'Mean Absolute Error: {mae}')

Mean Absolute Error: 15.398500000000002


In [74]:
from sklearn.metrics import r2_score
print("R² Score:", r2_score(y_test, pred)) 


R² Score: 0.40818826675800046


# DAG (Directed Acyclic graphs)

In [77]:
from airflow import DAG
from airflow.operators.python import PythonOperator
from datetime import datetime



In [83]:
def preprocess_data():
    file_path = 'screentime_analysis.csv'
    data=pd.read_csv(file_path)

    data['Date'] = pd.to_datetime(data['Date'])
    data['DayOfWeek'] = data['Date'].dt.dayofweek
    data['Month'] = data['Date'].dt.month

    data = data.drop(columns=['Date'])

    data = pd.get_dummies(data, columns=['App'], drop_first=True)

    scaler = MinMaxScaler()
    data[['Notifications', 'Times Opened']] = scaler.fit_transform(data[['Notifications', 'Times Opened']])

    preprocessed_path = 'preprocessed_screentime_analysis.csv'
    data.to_csv(preprocessed_path, index=False)
    print(f"Preprocessed data saved to {preprocessed_path}")


In [81]:
#define the DAG
dag = DAG(

    dag_id='data_preprocessing',
    schedule_interval='@daily',
    start_date=datetime(2025,2,2),
    catchup=False,
)

In [84]:
#define the task
preprocess_task = PythonOperator(
    task_id = 'preprocess',
    python_callable=preprocess_data,
    dag=dag,
)