IMPORTING THE DEPENDENCIES

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import re
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

#classification models
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier

#importing the performance evaluation metrics
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

Data Collection and Processing

In [2]:
#loading the dataset into the Pandas Data Frame
# The location is kept in one named constant so it can be changed in a single
# place when the notebook is run outside the environment it was authored in.
DATA_PATH = '/content/spotify_history (1).csv'
df = pd.read_csv(DATA_PATH)

In [3]:
#checking the first five rows of the Data Frame
df.head()

Unnamed: 0,spotify_track_uri,ts,platform,ms_played,track_name,artist_name,album_name,reason_start,reason_end,shuffle,skipped
0,2J3n32GeLmMjwuAzyhcSNe,2013-07-08 02:44:34,web player,3185,"Say It, Just Say It",The Mowgli's,Waiting For The Dawn,autoplay,clickrow,False,False
1,1oHxIPqJyvAYHy0PVrDU98,2013-07-08 02:45:37,web player,61865,Drinking from the Bottle (feat. Tinie Tempah),Calvin Harris,18 Months,clickrow,clickrow,False,False
2,487OPlneJNni3NWC8SYqhW,2013-07-08 02:50:24,web player,285386,Born To Die,Lana Del Rey,Born To Die - The Paradise Edition,clickrow,unknown,False,False
3,5IyblF777jLZj1vGHG2UD3,2013-07-08 02:52:40,web player,134022,Off To The Races,Lana Del Rey,Born To Die - The Paradise Edition,trackdone,clickrow,False,False
4,0GgAAB0ZMllFhbNc3mAodO,2013-07-08 03:17:52,web player,0,Half Mast,Empire Of The Sun,Walking On A Dream,clickrow,nextbtn,False,False


In [4]:
#checking the last five rows of the Data Frame
df.tail()

Unnamed: 0,spotify_track_uri,ts,platform,ms_played,track_name,artist_name,album_name,reason_start,reason_end,shuffle,skipped
149855,4Fz1WWr5o0OrlIcZxcyZtK,2024-12-15 23:06:19,android,1247,On The Way Home,John Mayer,Paradise Valley,fwdbtn,fwdbtn,True,True
149856,0qHMhBZqYb99yhX9BHcIkV,2024-12-15 23:06:21,android,1515,Magical Mystery Tour - Remastered 2009,The Beatles,Magical Mystery Tour,fwdbtn,fwdbtn,True,True
149857,0HHdujGjOZChTrl8lJWEIq,2024-12-15 23:06:22,android,1283,"Stop This Train - Live at the Nokia Theatre, L...",John Mayer,Where the Light Is: John Mayer Live In Los Ang...,fwdbtn,fwdbtn,True,True
149858,7peh6LUcdNPcMdrSH4JPsM,2024-12-15 23:06:23,android,1306,I Don't Trust Myself (With Loving You),John Mayer,Continuum,fwdbtn,fwdbtn,True,True
149859,6iGU74CwXuT4XVepjc9Emf,2024-12-15 23:06:25,android,1893,God Only Knows - Mono,The Beach Boys,Pet Sounds,fwdbtn,fwdbtn,True,True


In [5]:
# get basic information about the Data Frame
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 149860 entries, 0 to 149859
Data columns (total 11 columns):
 #   Column             Non-Null Count   Dtype 
---  ------             --------------   ----- 
 0   spotify_track_uri  149860 non-null  object
 1   ts                 149860 non-null  object
 2   platform           149860 non-null  object
 3   ms_played          149860 non-null  int64 
 4   track_name         149860 non-null  object
 5   artist_name        149860 non-null  object
 6   album_name         149860 non-null  object
 7   reason_start       149717 non-null  object
 8   reason_end         149743 non-null  object
 9   shuffle            149860 non-null  bool  
 10  skipped            149860 non-null  bool  
dtypes: bool(2), int64(1), object(8)
memory usage: 10.6+ MB


In [6]:
#checking the features of the Data Frame
df.columns

Index(['spotify_track_uri', 'ts', 'platform', 'ms_played', 'track_name',
       'artist_name', 'album_name', 'reason_start', 'reason_end', 'shuffle',
       'skipped'],
      dtype='object')

**Variable Description**

1. spotify_track_uri: The unique identifier for a track on Spotify.

2. ts: A timestamp representing when the track was played.

3. platform: The platform used to play the track (e.g., mobile, desktop, web).

4. ms_played: The duration for which the track was played, measured in milliseconds.

5. track_name: The name of the track.

6. artist_name: The name of the artist who performed the track.

7. album_name: The name of the album to which the track belongs.

8. reason_start: The reason why the track was started (e.g., user action, autoplay).

9. reason_end: The reason why the track was stopped (e.g., user action, end of track).

10. shuffle: Indicates whether the track was played in shuffle mode (boolean value).

11. skipped: Indicates whether the track was skipped (boolean value)

In [7]:
#checking the number of rows and columns in the Data Frame
df.shape

(149860, 11)

In [8]:
#checking for the presence of any missing values in all the columns of the Data Frame
df.isna().sum()

Unnamed: 0,0
spotify_track_uri,0
ts,0
platform,0
ms_played,0
track_name,0
artist_name,0
album_name,0
reason_start,143
reason_end,117
shuffle,0


Data Cleaning

In [9]:
df['reason_start'].unique()

array(['autoplay', 'clickrow', 'trackdone', 'nextbtn', 'backbtn',
       'unknown', 'popup', 'appload', 'fwdbtn', 'trackerror', nan,
       'remote', 'endplay', 'playbtn'], dtype=object)

In [10]:
df['reason_start'].value_counts()

Unnamed: 0_level_0,count
reason_start,Unnamed: 1_level_1
trackdone,76655
fwdbtn,53793
clickrow,11228
appload,3729
backbtn,2205
playbtn,1458
remote,477
trackerror,120
unknown,23
nextbtn,21


In [11]:
#fixing the missing values of the reason_start and the reason_end features
# Each gap is filled with the most recent preceding non-null value in the same
# column. Series.ffill() is the supported spelling; fillna(method='ffill') is
# deprecated and emits a FutureWarning.
df['reason_start'] = df['reason_start'].ffill()
df['reason_end'] = df['reason_end'].ffill()

  df['reason_start'] = df['reason_start'].fillna(method='ffill')
  df['reason_end'] = df['reason_end'].fillna(method='ffill')


In [12]:
#checking for any missing values after filling the missing values in the reason_start and the reason_end features
df.isna().sum()

Unnamed: 0,0
spotify_track_uri,0
ts,0
platform,0
ms_played,0
track_name,0
artist_name,0
album_name,0
reason_start,0
reason_end,0
shuffle,0


In [13]:
#checking the timestamp column before separating it into time, date and month parts
df['ts']

Unnamed: 0,ts
0,2013-07-08 02:44:34
1,2013-07-08 02:45:37
2,2013-07-08 02:50:24
3,2013-07-08 02:52:40
4,2013-07-08 03:17:52
...,...
149855,2024-12-15 23:06:19
149856,2024-12-15 23:06:21
149857,2024-12-15 23:06:22
149858,2024-12-15 23:06:23


In [14]:
#converting the timestamp into a date and time data type
df['ts'] = pd.to_datetime(df['ts'])

In [15]:
#checking the data types of the features
df.dtypes

Unnamed: 0,0
spotify_track_uri,object
ts,datetime64[ns]
platform,object
ms_played,int64
track_name,object
artist_name,object
album_name,object
reason_start,object
reason_end,object
shuffle,bool


In [16]:
# Derive calendar/clock features from the parsed `ts` column.
#creating the date feature (timestamp truncated to the day, stored as datetime)
df['date'] = pd.to_datetime(df['ts'].dt.date)
#creating the time-of-day feature from the timestamp
df['time'] = df['ts'].dt.time
#creating the month (number) feature
df['month'] = df['ts'].dt.month
#creating the hour feature
df['hour'] = df['ts'].dt.hour
#creating the day_name feature (weekday name, taken from the `date` column)
df['day_name'] = df['date'].dt.day_name()
#creating the month name feature
df['month_name'] = df['date'].dt.month_name()
#creating the day-of-month feature
df['day'] = df['ts'].dt.day
#creating the year feature
df['year'] = df['ts'].dt.year
#creating the second feature
df['second'] = df['ts'].dt.second
#creating the minute feature
df['minute'] = df['ts'].dt.minute

In [17]:
df.head()

Unnamed: 0,spotify_track_uri,ts,platform,ms_played,track_name,artist_name,album_name,reason_start,reason_end,shuffle,...,date,time,month,hour,day_name,month_name,day,year,second,minute
0,2J3n32GeLmMjwuAzyhcSNe,2013-07-08 02:44:34,web player,3185,"Say It, Just Say It",The Mowgli's,Waiting For The Dawn,autoplay,clickrow,False,...,2013-07-08,02:44:34,7,2,Monday,July,8,2013,34,44
1,1oHxIPqJyvAYHy0PVrDU98,2013-07-08 02:45:37,web player,61865,Drinking from the Bottle (feat. Tinie Tempah),Calvin Harris,18 Months,clickrow,clickrow,False,...,2013-07-08,02:45:37,7,2,Monday,July,8,2013,37,45
2,487OPlneJNni3NWC8SYqhW,2013-07-08 02:50:24,web player,285386,Born To Die,Lana Del Rey,Born To Die - The Paradise Edition,clickrow,unknown,False,...,2013-07-08,02:50:24,7,2,Monday,July,8,2013,24,50
3,5IyblF777jLZj1vGHG2UD3,2013-07-08 02:52:40,web player,134022,Off To The Races,Lana Del Rey,Born To Die - The Paradise Edition,trackdone,clickrow,False,...,2013-07-08,02:52:40,7,2,Monday,July,8,2013,40,52
4,0GgAAB0ZMllFhbNc3mAodO,2013-07-08 03:17:52,web player,0,Half Mast,Empire Of The Sun,Walking On A Dream,clickrow,nextbtn,False,...,2013-07-08,03:17:52,7,3,Monday,July,8,2013,52,17


In [18]:
df['platform'].unique()

array(['web player', 'windows', 'android', 'iOS', 'cast to device', 'mac'],
      dtype=object)

In [19]:
df['track_name'].unique()

array(['Say It, Just Say It',
       'Drinking from the Bottle (feat. Tinie Tempah)', 'Born To Die',
       ..., 'Oo-De-Lally - From "Robin Hood"', 'King Of The Road',
       'Chug-A-Lug'], dtype=object)

In [20]:
# Flag repeated plays: RepeatPlay is 1 when the track name occurs more than once
# anywhere in the listening history, otherwise 0.
# NOTE: the count is taken over the whole DataFrame; no per-month window is applied.
#
# Vectorised replacement for a per-track loop that re-scanned the full frame
# once for every unique track name. value_counts() yields plays per name and
# map() broadcasts that count back onto each row.
track_play_counts = df['track_name'].map(df['track_name'].value_counts())
# Stored as float64 (1.0 / 0.0), the dtype the row-wise assignment produced;
# the integer cast happens in a later cell. Rows with a missing track_name
# (none in this dataset) get 0.
df['RepeatPlay'] = (track_play_counts > 1).astype('float64')
# Display the DataFrame
print(df)

             spotify_track_uri                  ts    platform  ms_played  \
0       2J3n32GeLmMjwuAzyhcSNe 2013-07-08 02:44:34  web player       3185   
1       1oHxIPqJyvAYHy0PVrDU98 2013-07-08 02:45:37  web player      61865   
2       487OPlneJNni3NWC8SYqhW 2013-07-08 02:50:24  web player     285386   
3       5IyblF777jLZj1vGHG2UD3 2013-07-08 02:52:40  web player     134022   
4       0GgAAB0ZMllFhbNc3mAodO 2013-07-08 03:17:52  web player          0   
...                        ...                 ...         ...        ...   
149855  4Fz1WWr5o0OrlIcZxcyZtK 2024-12-15 23:06:19     android       1247   
149856  0qHMhBZqYb99yhX9BHcIkV 2024-12-15 23:06:21     android       1515   
149857  0HHdujGjOZChTrl8lJWEIq 2024-12-15 23:06:22     android       1283   
149858  7peh6LUcdNPcMdrSH4JPsM 2024-12-15 23:06:23     android       1306   
149859  6iGU74CwXuT4XVepjc9Emf 2024-12-15 23:06:25     android       1893   

                                               track_name        artist_nam

In [39]:
df.head()

Unnamed: 0,platform,ms_played,reason_start,reason_end,shuffle,skipped,date,time,month,hour,day_name,month_name,day,year,second,minute,RepeatPlay,nlp
0,4,3185,1,2,0,0,2013-07-08,02:44:34,7,2,Monday,July,8,2013,34,44,0,say say mowgli wait dawn
1,4,61865,3,2,0,0,2013-07-08,02:45:37,7,2,Monday,July,8,2013,37,45,1,drink bottl feat tini tempah calvin harri month
2,4,285386,3,14,0,0,2013-07-08,02:50:24,7,2,Monday,July,8,2013,24,50,1,born die lana del rey born die paradis edit
3,4,134022,10,2,0,0,2013-07-08,02:52:40,7,2,Monday,July,8,2013,40,52,0,race lana del rey born die paradis edit
4,4,0,3,6,0,0,2013-07-08,03:17:52,7,3,Monday,July,8,2013,52,17,0,half mast empir sun walk dream


Unnamed: 0,platform,ms_played,reason_start,reason_end,shuffle,skipped,date,time,month,hour,day_name,month_name,day,year,second,minute,RepeatPlay,nlp
0,4,3185,1,2,0,0,2013-07-08,02:44:34,7,2,Monday,July,8,2013,34,44,0,say say mowgli wait dawn
1,4,61865,3,2,0,0,2013-07-08,02:45:37,7,2,Monday,July,8,2013,37,45,1,drink bottl feat tini tempah calvin harri month
2,4,285386,3,14,0,0,2013-07-08,02:50:24,7,2,Monday,July,8,2013,24,50,1,born die lana del rey born die paradis edit
3,4,134022,10,2,0,0,2013-07-08,02:52:40,7,2,Monday,July,8,2013,40,52,0,race lana del rey born die paradis edit
4,4,0,3,6,0,0,2013-07-08,03:17:52,7,3,Monday,July,8,2013,52,17,0,half mast empir sun walk dream


In [22]:
df['RepeatPlay'].dtype

dtype('float64')

In [23]:
#converting RepeatPlay from float to the nullable integer data type
df['RepeatPlay'] = df['RepeatPlay'].astype('Int64')

In [24]:
df.head()

Unnamed: 0,spotify_track_uri,ts,platform,ms_played,track_name,artist_name,album_name,reason_start,reason_end,shuffle,...,time,month,hour,day_name,month_name,day,year,second,minute,RepeatPlay
0,2J3n32GeLmMjwuAzyhcSNe,2013-07-08 02:44:34,web player,3185,"Say It, Just Say It",The Mowgli's,Waiting For The Dawn,autoplay,clickrow,False,...,02:44:34,7,2,Monday,July,8,2013,34,44,0
1,1oHxIPqJyvAYHy0PVrDU98,2013-07-08 02:45:37,web player,61865,Drinking from the Bottle (feat. Tinie Tempah),Calvin Harris,18 Months,clickrow,clickrow,False,...,02:45:37,7,2,Monday,July,8,2013,37,45,1
2,487OPlneJNni3NWC8SYqhW,2013-07-08 02:50:24,web player,285386,Born To Die,Lana Del Rey,Born To Die - The Paradise Edition,clickrow,unknown,False,...,02:50:24,7,2,Monday,July,8,2013,24,50,1
3,5IyblF777jLZj1vGHG2UD3,2013-07-08 02:52:40,web player,134022,Off To The Races,Lana Del Rey,Born To Die - The Paradise Edition,trackdone,clickrow,False,...,02:52:40,7,2,Monday,July,8,2013,40,52,0
4,0GgAAB0ZMllFhbNc3mAodO,2013-07-08 03:17:52,web player,0,Half Mast,Empire Of The Sun,Walking On A Dream,clickrow,nextbtn,False,...,03:17:52,7,3,Monday,July,8,2013,52,17,0


LABEL ENCODING

In [25]:
from sklearn.preprocessing import LabelEncoder

# One independent encoder per column, so each keeps its own fitted classes_.
label_encoder, label_encoder1, label_encoder2, label_encoder3, label_encoder4 = (
    LabelEncoder() for _ in range(5)
)

# Replace every categorical/boolean column with its integer codes, in place.
for column, encoder in (
    ('platform', label_encoder),
    ('reason_start', label_encoder1),
    ('reason_end', label_encoder2),
    ('shuffle', label_encoder3),
    ('skipped', label_encoder4),
):
    df[column] = encoder.fit_transform(df[column])

In [26]:
#printing out the five first rows after the label encoding
df.head()

Unnamed: 0,spotify_track_uri,ts,platform,ms_played,track_name,artist_name,album_name,reason_start,reason_end,shuffle,...,time,month,hour,day_name,month_name,day,year,second,minute,RepeatPlay
0,2J3n32GeLmMjwuAzyhcSNe,2013-07-08 02:44:34,4,3185,"Say It, Just Say It",The Mowgli's,Waiting For The Dawn,1,2,0,...,02:44:34,7,2,Monday,July,8,2013,34,44,0
1,1oHxIPqJyvAYHy0PVrDU98,2013-07-08 02:45:37,4,61865,Drinking from the Bottle (feat. Tinie Tempah),Calvin Harris,18 Months,3,2,0,...,02:45:37,7,2,Monday,July,8,2013,37,45,1
2,487OPlneJNni3NWC8SYqhW,2013-07-08 02:50:24,4,285386,Born To Die,Lana Del Rey,Born To Die - The Paradise Edition,3,14,0,...,02:50:24,7,2,Monday,July,8,2013,24,50,1
3,5IyblF777jLZj1vGHG2UD3,2013-07-08 02:52:40,4,134022,Off To The Races,Lana Del Rey,Born To Die - The Paradise Edition,10,2,0,...,02:52:40,7,2,Monday,July,8,2013,40,52,0
4,0GgAAB0ZMllFhbNc3mAodO,2013-07-08 03:17:52,4,0,Half Mast,Empire Of The Sun,Walking On A Dream,3,6,0,...,03:17:52,7,3,Monday,July,8,2013,52,17,0


In [27]:
# Remove the raw timestamp and the spotify_track_uri columns from df in place
df.drop(columns=['ts', 'spotify_track_uri'], inplace=True)

In [28]:
df.head()

Unnamed: 0,platform,ms_played,track_name,artist_name,album_name,reason_start,reason_end,shuffle,skipped,date,time,month,hour,day_name,month_name,day,year,second,minute,RepeatPlay
0,4,3185,"Say It, Just Say It",The Mowgli's,Waiting For The Dawn,1,2,0,0,2013-07-08,02:44:34,7,2,Monday,July,8,2013,34,44,0
1,4,61865,Drinking from the Bottle (feat. Tinie Tempah),Calvin Harris,18 Months,3,2,0,0,2013-07-08,02:45:37,7,2,Monday,July,8,2013,37,45,1
2,4,285386,Born To Die,Lana Del Rey,Born To Die - The Paradise Edition,3,14,0,0,2013-07-08,02:50:24,7,2,Monday,July,8,2013,24,50,1
3,4,134022,Off To The Races,Lana Del Rey,Born To Die - The Paradise Edition,10,2,0,0,2013-07-08,02:52:40,7,2,Monday,July,8,2013,40,52,0
4,4,0,Half Mast,Empire Of The Sun,Walking On A Dream,3,6,0,0,2013-07-08,03:17:52,7,3,Monday,July,8,2013,52,17,0


In [29]:
# Build one free-text field per row: track, artist and album names joined by single spaces.
df['nlp'] = df['track_name'] + ' '+ df['artist_name'] + ' '+df['album_name']

In [30]:
df['nlp']

Unnamed: 0,nlp
0,"Say It, Just Say It The Mowgli's Waiting For T..."
1,Drinking from the Bottle (feat. Tinie Tempah) ...
2,Born To Die Lana Del Rey Born To Die - The Par...
3,Off To The Races Lana Del Rey Born To Die - Th...
4,Half Mast Empire Of The Sun Walking On A Dream
...,...
149855,On The Way Home John Mayer Paradise Valley
149856,Magical Mystery Tour - Remastered 2009 The Bea...
149857,"Stop This Train - Live at the Nokia Theatre, L..."
149858,I Don't Trust Myself (With Loving You) John Ma...


In [31]:
# Stemmer instance used by the text-cleaning function defined in the next cell
port_stem = PorterStemmer()
import nltk
# Fetch the NLTK stop-word corpus needed by stopwords.words('english')
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [32]:
#creating a function for cleaning and stemming the textual feature
# Build the stop-word lookup once. Calling stopwords.words('english') for every
# token reloads the list each time and scans it linearly; a frozenset gives a
# constant-time membership test instead.
_ENGLISH_STOPWORDS = frozenset(stopwords.words('english'))

def cleaning(message):
  """Normalise a text string for vectorisation.

  Steps: replace every non-letter character with a space, lowercase, split on
  whitespace, drop English stop words, Porter-stem the remaining words and
  join them back with single spaces.

  Parameters
  ----------
  message : str
      Raw text to clean.

  Returns
  -------
  str
      Space-separated stemmed tokens (empty string if nothing survives).
  """
  letters_only = re.sub('[^a-zA-Z]', ' ', message)
  tokens = letters_only.lower().split()
  stemmed_tokens = [port_stem.stem(word) for word in tokens if word not in _ENGLISH_STOPWORDS]

  return ' '.join(stemmed_tokens)

In [33]:
df['nlp'] = df['nlp'].apply(cleaning)

In [34]:
df.head()

Unnamed: 0,platform,ms_played,track_name,artist_name,album_name,reason_start,reason_end,shuffle,skipped,date,...,month,hour,day_name,month_name,day,year,second,minute,RepeatPlay,nlp
0,4,3185,"Say It, Just Say It",The Mowgli's,Waiting For The Dawn,1,2,0,0,2013-07-08,...,7,2,Monday,July,8,2013,34,44,0,say say mowgli wait dawn
1,4,61865,Drinking from the Bottle (feat. Tinie Tempah),Calvin Harris,18 Months,3,2,0,0,2013-07-08,...,7,2,Monday,July,8,2013,37,45,1,drink bottl feat tini tempah calvin harri month
2,4,285386,Born To Die,Lana Del Rey,Born To Die - The Paradise Edition,3,14,0,0,2013-07-08,...,7,2,Monday,July,8,2013,24,50,1,born die lana del rey born die paradis edit
3,4,134022,Off To The Races,Lana Del Rey,Born To Die - The Paradise Edition,10,2,0,0,2013-07-08,...,7,2,Monday,July,8,2013,40,52,0,race lana del rey born die paradis edit
4,4,0,Half Mast,Empire Of The Sun,Walking On A Dream,3,6,0,0,2013-07-08,...,7,3,Monday,July,8,2013,52,17,0,half mast empir sun walk dream


In [35]:
# The three raw name columns are removed in place; their text is already
# combined in the `nlp` column.
df.drop(columns=['track_name', 'artist_name', 'album_name'], inplace=True)

In [36]:
df.head()

Unnamed: 0,platform,ms_played,reason_start,reason_end,shuffle,skipped,date,time,month,hour,day_name,month_name,day,year,second,minute,RepeatPlay,nlp
0,4,3185,1,2,0,0,2013-07-08,02:44:34,7,2,Monday,July,8,2013,34,44,0,say say mowgli wait dawn
1,4,61865,3,2,0,0,2013-07-08,02:45:37,7,2,Monday,July,8,2013,37,45,1,drink bottl feat tini tempah calvin harri month
2,4,285386,3,14,0,0,2013-07-08,02:50:24,7,2,Monday,July,8,2013,24,50,1,born die lana del rey born die paradis edit
3,4,134022,10,2,0,0,2013-07-08,02:52:40,7,2,Monday,July,8,2013,40,52,0,race lana del rey born die paradis edit
4,4,0,3,6,0,0,2013-07-08,03:17:52,7,3,Monday,July,8,2013,52,17,0,half mast empir sun walk dream


In [40]:
text_features = df['nlp']

In [41]:
# Turn the cleaned text into a sparse TF-IDF matrix (one row per play).
feature_extraction = TfidfVectorizer(min_df=1, stop_words = 'english', lowercase= True)
# Fit from df['nlp'] directly rather than from `text_features`: this cell rebinds
# `text_features` to the sparse result, so feeding that name back in would fail
# when the cell is executed a second time.
text_features  = feature_extraction.fit_transform(df['nlp'])

In [41]:
text_features

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 864562 stored elements and shape (149860, 13507)>

In [42]:
# Keep only the numeric/encoded columns: drop the date/time objects, the
# name strings and the free-text field. Returns a new frame; df is untouched.
non_numeric_columns = ['date', 'time', 'day_name', 'month_name', 'nlp']
numerical_data = df.drop(columns=non_numeric_columns)

In [43]:
numerical_data.head()

Unnamed: 0,platform,ms_played,reason_start,reason_end,shuffle,skipped,month,hour,day,year,second,minute,RepeatPlay
0,4,3185,1,2,0,0,7,2,8,2013,34,44,0
1,4,61865,3,2,0,0,7,2,8,2013,37,45,1
2,4,285386,3,14,0,0,7,2,8,2013,24,50,1
3,4,134022,10,2,0,0,7,2,8,2013,40,52,0
4,4,0,3,6,0,0,7,3,8,2013,52,17,0


In [78]:
df['RepeatPlay'].value_counts()

Unnamed: 0_level_0,count
RepeatPlay,Unnamed: 1_level_1
1,142861
0,6999


In [45]:
# Remove the target column from the feature frame.
# errors='ignore' keeps this self-referential cell safe to re-run: without it a
# second execution raises KeyError because the column is already gone.
numerical_data = numerical_data.drop(columns=['RepeatPlay'], errors='ignore')

In [46]:
numerical_data.head()

Unnamed: 0,platform,ms_played,reason_start,reason_end,shuffle,skipped,month,hour,day,year,second,minute
0,4,3185,1,2,0,0,7,2,8,2013,34,44
1,4,61865,3,2,0,0,7,2,8,2013,37,45
2,4,285386,3,14,0,0,7,2,8,2013,24,50
3,4,134022,10,2,0,0,7,2,8,2013,40,52
4,4,0,3,6,0,0,7,3,8,2013,52,17


In [61]:
numerical_data.values

array([[     4,   3185,      1, ...,   2013,     34,     44],
       [     4,  61865,      3, ...,   2013,     37,     45],
       [     4, 285386,      3, ...,   2013,     24,     50],
       ...,
       [     0,   1283,      5, ...,   2024,     22,      6],
       [     0,   1306,      5, ...,   2024,     23,      6],
       [     0,   1893,      5, ...,   2024,     25,      6]])

STANDARDIZATION

In [63]:
#Standardize the numerical data
standardize = StandardScaler()

In [64]:
# Fit the scaler on every row of numerical_data and scale it in the same step.
# NOTE(review): this runs before the train/test split performed later in the
# notebook, so rows that end up in the test set also contribute to the fitted
# statistics — consider fitting on the training split only.
standardized_numerical_data = standardize.fit_transform(numerical_data)

In [66]:
#printing out the standardized data
standardized_numerical_data

array([[ 5.66746507, -1.06188042, -2.11819887, ..., -2.87079384,
         0.25719734,  0.82288569],
       [ 5.66746507, -0.56391567, -1.44355173, ..., -2.87079384,
         0.43019864,  0.88082346],
       [ 5.66746507,  1.3329074 , -1.44355173, ..., -2.87079384,
        -0.31947366,  1.17051232],
       ...,
       [-0.22489543, -1.07802099, -0.7689046 , ...,  1.80403382,
        -0.43480786, -1.37874962],
       [-0.22489543, -1.07782581, -0.7689046 , ...,  1.80403382,
        -0.37714076, -1.37874962],
       [-0.22489543, -1.07284447, -0.7689046 , ...,  1.80403382,
        -0.26180656, -1.37874962]])

In [67]:
from scipy.sparse import csr_matrix, hstack

# Convert the scaled numerical block to a sparse matrix so it can be stacked
# with the TF-IDF matrix without densifying the latter.
numerical_sparse_matrix = csr_matrix(standardized_numerical_data)

# Place the numerical columns first, then the text columns, side by side.
# format='csr' is requested explicitly: hstack otherwise returns a COO matrix,
# which cannot be row-indexed or sliced directly.
combined_sparse_matrix = hstack([numerical_sparse_matrix, text_features], format='csr')

print(combined_sparse_matrix.shape)  # Ensure the shape matches expectations


(149860, 13519)


In [79]:
Y = df['RepeatPlay']

In [81]:
#Independent and dependent variable
X = combined_sparse_matrix
Y = df['RepeatPlay']

DATA SAMPLING: Oversampling

In [82]:
# Oversampling to balance the two classes of the target variable.
# NOTE(review): X and Y are resampled here, before the train/test split done
# later in the notebook. Random oversampling repeats existing minority rows, so
# copies of the same row can land in both the training and the test set and
# inflate the reported scores — consider splitting first and resampling only
# the training portion.
#importing the RandomOverSampler
from imblearn.over_sampling import RandomOverSampler

# Define the oversampler (sampling_strategy=1 requests a 1:1 class ratio; seeded for repeatability)
ros = RandomOverSampler(sampling_strategy=1, random_state=42)
# Perform the oversampling; X and Y are overwritten with the resampled data
X, Y = ros.fit_resample(X, Y)

# Verify the class distribution
Y.value_counts()

Unnamed: 0_level_0,count
RepeatPlay,Unnamed: 1_level_1
0,142861
1,142861


TRAIN, TEST, SPLIT

In [83]:
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

In [84]:
print(x_train.shape, y_train.shape)

(228577, 13519) (228577,)


In [85]:
print(x_test.shape, y_test.shape)

(57145, 13519) (57145,)


Model Training and Prediction

In [91]:
# Create a dictionary to store the classification models
# (display name -> unfitted estimator).
models = {
    #'SVC': SVC(kernel='poly'),
    # Seeded so the forest's bootstrap/feature sampling is repeatable across
    # runs, in line with the random_state=42 used for resampling and splitting.
    'RandomForest': RandomForestClassifier(random_state=42),
    'KNeighbors': KNeighborsClassifier(),
    'LogisticRegression': LogisticRegression(max_iter= 1000)
}

In [92]:
# Fit each candidate on the training split, score it on the held-out split and
# print accuracy, the per-class report and the confusion matrix.
for model_name in models:
    model = models[model_name]

    # Fit, then predict the held-out rows
    model.fit(x_train, y_train)
    predictions = model.predict(x_test)

    # Overall accuracy on the test data
    accuracy = accuracy_score(y_test, predictions)
    print(f"{model_name} Accuracy: {accuracy}")

    # Per-class precision / recall / f1
    print(f'{model_name}: Classification Report')
    print(classification_report(y_test, predictions))

    # Confusion matrix (rows: true class, columns: predicted class)
    cm = confusion_matrix(y_test, predictions)
    print(f"{model_name} Confusion Matrix:")
    print(np.array2string(cm, separator=', '))

    # Visual separator between models
    print("\n" + "="*40 + "\n")

RandomForest Accuracy: 0.9939452270539855
RandomForest: Classification Report
              precision    recall  f1-score   support

         0.0       0.99      1.00      0.99     28687
         1.0       1.00      0.99      0.99     28458

    accuracy                           0.99     57145
   macro avg       0.99      0.99      0.99     57145
weighted avg       0.99      0.99      0.99     57145

RandomForest Confusion Matrix:
[[28687,     0],
 [  346, 28112]]


KNeighbors Accuracy: 0.9579840755971651
KNeighbors: Classification Report
              precision    recall  f1-score   support

         0.0       0.92      1.00      0.96     28687
         1.0       1.00      0.92      0.96     28458

    accuracy                           0.96     57145
   macro avg       0.96      0.96      0.96     57145
weighted avg       0.96      0.96      0.96     57145

KNeighbors Confusion Matrix:
[[28687,     0],
 [ 2401, 26057]]


LogisticRegression Accuracy: 0.9428296438883542
LogisticRegres