In [57]:
# Upgrade pip itself before installing anything else; the %pip magic is used
# (rather than !pip) so the command targets the kernel's own environment.
%pip install --upgrade pip

Note: you may need to restart the kernel to use updated packages.


In [58]:
#check if seaborn is installed

import subprocess

try:
    subprocess.check_call(['pip', 'show', 'seaborn'])
except subprocess.CalledProcessError:
    # Seaborn is not installed
    %pip install seaborn
else:
    print('Seaborn is already installed.')

Seaborn is already installed.


In [59]:
#check if plotly is installed

import subprocess

try:
    subprocess.check_call(['pip', 'show', 'plotly'])
except subprocess.CalledProcessError:
    # plotly is not installed
    %pip install plotly
else:
    print('plotly is already installed.')



In [None]:
#check if xgboost is installed

import subprocess

try:
    subprocess.check_call(['pip', 'show', 'xgboost'])
except subprocess.CalledProcessError:
    # xgboost is not installed
    %pip install xgboost
else:
    print('xgboost is already installed.')

In [None]:
# Import the libraries used throughout the notebook and set global display/plot options.

# Data handling and the gradient-boosting library
import pandas as pd
import numpy as np
import xgboost as xgb


# scikit-learn helpers: data splitting, metrics, and grid search
from sklearn.model_selection import train_test_split
from sklearn.metrics import balanced_accuracy_score, roc_auc_score, make_scorer, confusion_matrix, ConfusionMatrixDisplay
from sklearn.model_selection import GridSearchCV

# Plotting libraries
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px
from sklearn import preprocessing
import matplotlib 
# Use the 'ggplot' matplotlib style for every figure drawn after this point.
matplotlib.style.use('ggplot')
from sklearn.preprocessing import LabelEncoder


# Show every column when a DataFrame is displayed. The matching option for
# rows is left commented out, so long frames are still truncated row-wise.
# pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)


In [None]:
# List the worksheets contained in the source workbook.
import openpyxl

# Path of the Excel workbook that holds all the raw data sheets
inputExcelFile = "MorpaData.xlsx"

# Open the workbook and read its sheet titles (a list of strings)
newWorkbook = openpyxl.load_workbook(inputExcelFile)
sheetNames = newWorkbook.sheetnames

# Print a header line followed by one sheet title per line
print('The Sheet names of the given excel file: ')
for sheet_title in sheetNames:
    print(sheet_title)

In [None]:
# Load the member list sheet and take a first look at it.

member_df = pd.read_excel('MorpaData.xlsx', sheet_name='Üye Listesi')
print(member_df.head(10), '\n\n')

# DataFrame.info() prints its report itself and returns None, so it is called
# directly; wrapping it in print() would append a stray "None" to the output.
member_df.info()

# Number of distinct members in the list
print(member_df['Üye'].nunique())

In [None]:
# Load the login log sheet and parse its timestamp columns.

login_df = pd.read_excel('MorpaData.xlsx', sheet_name='Giriş Logları')

# Both columns are parsed with an explicit day/month/year format.
for time_col in ('Giriş Zamanı', 'Çıkış Zamanı'):
    login_df[time_col] = pd.to_datetime(login_df[time_col], format='%d/%m/%Y %H:%M:%S')

print(login_df.head(10), '\n\n')

# DataFrame.info() prints its report itself and returns None, so it is called
# directly; wrapping it in print() would add a stray "None" to the output.
login_df.info()
print('\n\n')

# Number of distinct members that appear in the login data
print("unique member count on login data: ", login_df['Üye'].nunique())

In [None]:
# Load the lecture-viewing log sheet, parse its timestamps and derive durations.

lecturing_df = pd.read_excel('MorpaData.xlsx', sheet_name='Konu anlatımı')

# Both columns are parsed with an explicit day/month/year format.
for time_col in ('Giriş Zamanı', 'Çıkış Zamanı'):
    lecturing_df[time_col] = pd.to_datetime(lecturing_df[time_col], format='%d/%m/%Y %H:%M:%S')

# Duration of each lecture session in whole seconds. The dtype is spelled out
# as int64 because plain `int` maps to a platform-dependent width on some
# NumPy builds, while later cells treat this column as int64.
lecturing_df['lec_Süre'] = (
    (lecturing_df['Çıkış Zamanı'] - lecturing_df['Giriş Zamanı'])
    .dt.total_seconds()
    .astype('int64')
)

print(lecturing_df.head(10), '\n\n')

# DataFrame.info() prints its report itself and returns None, so it is called
# directly; wrapping it in print() would add a stray "None" to the output.
lecturing_df.info()
print('\n\n')

# Number of distinct members that appear in the lecturing data
print("Unique number of users on lecturing: ", lecturing_df['Üye'].nunique())

In [None]:
# Load the study-activity log sheet, parse its timestamps and derive durations.

studies_df = pd.read_excel('MorpaData.xlsx', sheet_name='çalışmalar')

# Both columns are parsed with an explicit day/month/year format.
for time_col in ('Giriş Zamanı', 'Çıkış Zamanı'):
    studies_df[time_col] = pd.to_datetime(studies_df[time_col], format='%d/%m/%Y %H:%M:%S')

# Duration of each study session in whole seconds. The dtype is spelled out
# as int64 because plain `int` maps to a platform-dependent width on some
# NumPy builds, while later cells treat this column as int64.
studies_df['stud_Süre'] = (
    (studies_df['Çıkış Zamanı'] - studies_df['Giriş Zamanı'])
    .dt.total_seconds()
    .astype('int64')
)

print(studies_df.head(10), '\n\n')

# DataFrame.info() prints its report itself and returns None, so it is called
# directly; wrapping it in print() would add a stray "None" to the output.
studies_df.info()
print('\n\n')

# Number of distinct members that appear in the studies data
print("unique member count on studies data: ", studies_df['Üye'].nunique())

In [None]:
# Load the exam results sheet and normalise its timestamp columns.

exams_df = pd.read_excel('MorpaData.xlsx', sheet_name='Sınav')

# The cells of these columns already arrive as datetime objects, so they are
# converted directly instead of being formatted to text and parsed back (the
# old round trip wrote two spaces between date and time but parsed with one).
# Flooring to whole seconds keeps the effect of that round trip, which
# discarded any sub-second part.
for time_col in ('Giriş Zamanı', 'Çıkış Zamanı'):
    exams_df[time_col] = pd.to_datetime(exams_df[time_col]).dt.floor('s')

print(exams_df.head(10), '\n\n')

# DataFrame.info() prints its report itself and returns None, so it is called
# directly; wrapping it in print() would add a stray "None" to the output.
exams_df.info()
print('\n\n')

# Number of distinct members that appear in the exams data
print("unique member count on exams data: ", exams_df['Üye'].nunique())


# wrong answers are not deducted from the total score.

In [None]:
# Load the subject sheet.

subject_df = pd.read_excel('MorpaData.xlsx', sheet_name='konu')

# Only the last expression of a cell is rendered automatically, so the preview
# is shown explicitly; as a bare mid-cell expression it was silently dropped.
display(subject_df.head(10))

# Subjects ordered by active material count, largest first. This is a display
# only: subject_df itself keeps its original row order.
subject_df.sort_values(by=['Aktif Materyal Sayısı'], ascending=False)



In [None]:
# Attach the subject attributes to every exam row (left join on 'Konu'), then
# prefix the exam-specific time/duration columns with "S_" so they stay
# distinguishable in later join operations.

exam_column_renames = {
    "Giriş Zamanı": "S_Giriş Zamanı",
    "Çıkış Zamanı": "S_Çıkış Zamanı",
    "Süre": "S_süre",
}

exam_subject_df = (
    exams_df
    .merge(subject_df, how='left', on='Konu')
    .rename(columns=exam_column_renames)
)


In [None]:
# Keep only the rows whose 'Sınıf' value is present (the filter removes
# missing values; it does not test whether a value is numeric).
#
# .copy() makes the result an independent frame. Without it the next cells
# assign columns onto a filtered slice of the merged frame, which triggers
# SettingWithCopyWarning and is not guaranteed to behave consistently.

exam_subject_df = exam_subject_df[exam_subject_df['Sınıf'].notna()].copy()

exam_subject_df

In [None]:
# Cast the class, course and material-count columns to 64-bit integers in a
# single astype call.
exam_subject_df = exam_subject_df.astype({
    'Sınıf': 'int64',
    'Ders': 'int64',
    'Aktif Materyal Sayısı': 'int64',
    'Toplam Materyal Sayısı': 'int64',
})

exam_subject_df.head(10)

In [None]:
# For every exam row, add the time the same member spent on the same subject
# (lectures and studies) before the exam started ('S_Giriş Zamanı').
#
# The totals are computed with one merge + groupby per activity log rather
# than one full scan of the log per exam row, and without a scratch variable
# named `df` (that name is used for the main modelling frame later on).


def _seconds_before_exam(exams, activity, duration_col):
    """Return, per exam row, the summed ``duration_col`` of earlier activity.

    An activity row counts towards an exam row when it has the same 'Üye' and
    'Konu' and its 'Giriş Zamanı' is strictly earlier than the exam's
    'S_Giriş Zamanı'. Exam rows without such activity get 0.

    Parameters:
        exams: frame with 'Üye', 'Konu' and 'S_Giriş Zamanı' columns.
        activity: frame with 'Üye', 'Konu', 'Giriş Zamanı' and ``duration_col``.
        duration_col: name of the duration column to sum.

    Returns:
        int64 NumPy array aligned positionally with the rows of ``exams``.
    """
    exam_keys = exams[['Üye', 'Konu', 'S_Giriş Zamanı']].reset_index(drop=True)
    # Positional id of each exam row, used to bring the totals back in order.
    exam_keys['_exam_pos'] = exam_keys.index

    # merge() pairs missing keys with each other, whereas an equality test
    # never matches them, so activity rows without a key are dropped first.
    logs = activity.dropna(subset=['Üye', 'Konu'])[
        ['Üye', 'Konu', 'Giriş Zamanı', duration_col]
    ]

    paired = exam_keys.merge(logs, on=['Üye', 'Konu'], how='inner')
    earlier = paired[paired['Giriş Zamanı'] < paired['S_Giriş Zamanı']]
    totals = earlier.groupby('_exam_pos')[duration_col].sum()

    return totals.reindex(exam_keys.index, fill_value=0).astype('int64').to_numpy()


# cumulative lecture time before the exam date for each member of each subject
exam_subject_df['cum_lecture_time'] = _seconds_before_exam(exam_subject_df, lecturing_df, 'lec_Süre')

# cumulative study time before the exam date for each member of each subject
exam_subject_df['cum_study_time'] = _seconds_before_exam(exam_subject_df, studies_df, 'stud_Süre')

exam_subject_df['total_study_time'] = exam_subject_df['cum_lecture_time'] + exam_subject_df['cum_study_time']

exam_subject_df



In [None]:
# Tag the exam rows that look like "gaming the system" (GTS) on a deep copy of
# exam_subject_df, so the source frame stays untouched.

exams_df_gtsLabeled = exam_subject_df.copy(deep=True)

# Use a clean 0..n-1 index so that "previous row" is well defined.
exams_df_gtsLabeled.reset_index(drop=True, inplace=True)

# A row is labeled 1 when, compared with the row directly above it, it has
#   * the same member, exam, subject and login session,
#   * a start time earlier than the previous row's end time (the two overlap),
#   * a strictly higher score.
# Every other row, including the first one, is labeled 0.
#
# The rule is evaluated on whole columns against a one-row shift. The previous
# element-wise form assigned through chained indexing (frame['col'][i] = ...),
# which pandas does not guarantee to write through and which has no effect
# under copy-on-write.
context_columns = ['Üye', 'Sınav', 'Konu', 'GirişLog']
previous_row = exams_df_gtsLabeled[context_columns + ['Puan', 'S_Çıkış Zamanı']].shift(1)

same_context = (exams_df_gtsLabeled[context_columns] == previous_row[context_columns]).all(axis=1)
overlaps_previous = previous_row['S_Çıkış Zamanı'] > exams_df_gtsLabeled['S_Giriş Zamanı']
score_improved = previous_row['Puan'] < exams_df_gtsLabeled['Puan']
is_gaming = same_context & overlaps_previous & score_improved

# Overlap in seconds between the previous row's end and this row's start.
overlap_seconds = (
    previous_row['S_Çıkış Zamanı'] - exams_df_gtsLabeled['S_Giriş Zamanı']
).dt.total_seconds()

# Both columns are numeric from the start (0 / 0.0 where the rule does not
# apply) rather than being filled with empty-string placeholders.
exams_df_gtsLabeled['Label'] = is_gaming.astype('int64')
exams_df_gtsLabeled['Overlap (secs.)'] = overlap_seconds.where(is_gaming, 0.0)

exams_df_gtsLabeled

In [None]:
# Sort the tagged data by subject and member, and call the result `df` for
# the sake of simplicity.
#
# sort_values() returns a new frame, so its result has to be assigned; called
# on its own line the sort was computed and thrown away. Assigning it also
# makes `df` independent of exams_df_gtsLabeled, so the in-place edits made to
# `df` further down no longer change the labeled frame behind the scenes.

df = exams_df_gtsLabeled.sort_values(by=['Konu', 'Üye']).reset_index(drop=True)

df

In [None]:
# Count how many rows carry each Label value.
df['Label'].value_counts()

# In the original run the data had 7878 rows, 290 of them labeled 1.
# Rows are exam records, so this means 290 flagged exam records, which is not
# necessarily 290 distinct members (one member can be flagged more than once).


In [None]:
# Show only the rows that were tagged as gaming the system (Label equal to 1).

df[df['Label'].eq(1)]

In [None]:
# Replace every empty-string cell with 0, in place. This applies to the whole
# frame, not only to the Label column.

df.replace(to_replace="", value=0, inplace=True)

In [None]:
# Preview the first five rows after the empty cells were filled.
df.head(n=5)

In [None]:
# Re-check how many rows carry each Label value now that blanks are zeros.

df.loc[:, 'Label'].value_counts()

# ML SIDE

## Classification

In [None]:
# Show the dtype of every column of the current df, to see which columns
# still need a type conversion before modelling.

df.dtypes

In [None]:
# Convert the two derived columns to int64.
#
# Both Label and Overlap (secs.) are cast, as intended: previously only the
# overlap column was converted, which left Label dependent on whatever dtype
# the earlier fill step happened to produce.

for derived_col in ('Label', 'Overlap (secs.)'):
    df[derived_col] = df[derived_col].astype('int64')

df.dtypes

In [None]:
# Convert the exam start/end timestamps to Unix time (whole seconds).
#
# The value is derived by timedelta arithmetic instead of
# `astype('int64') // 10**9`: that shortcut is only correct when the column is
# stored with nanosecond resolution, which newer pandas versions no longer
# guarantee. The subtraction gives the same number for any resolution.
# NOTE(review): assumes both columns are timezone-naive datetimes — confirm.

unix_epoch = pd.Timestamp('1970-01-01')
one_second = pd.Timedelta(seconds=1)

for time_col in ('S_Giriş Zamanı', 'S_Çıkış Zamanı'):
    df[time_col] = (df[time_col] - unix_epoch) // one_second

df.dtypes

In [None]:
# Render floats without decimals from here on, then show the summary
# statistics with one row per column (transposed).
pd.set_option('display.float_format', '{:.0f}'.format)

df.describe().transpose()

In [None]:
# Distribution of the GTS-labeled rows over the class ('Sınıf') values.

# Keep only the rows with Label == 1
df_label_1 = df.loc[df["Label"].eq(1)]

# One bar per class; the hue only ever has the single value 1 in this subset
sns.countplot(data=df_label_1, x="Sınıf", hue="Label")


In [None]:
# Pairwise Pearson correlation between the columns of df (Pearson is the
# DataFrame.corr default; it is only spelled out here).
corr = df.corr(method='pearson')

# Draw the matrix as a Seaborn heatmap and render the figure
sns.heatmap(data=corr, cmap="YlGnBu")
plt.show()

In [None]:
# Drop, in place, the columns judged to be highly correlated with others, to
# avoid multicollinearity. 'Overlap (secs.)' is dropped with them; it is only
# ever filled for rows that the labeling step marked with Label 1.

redundant_columns = [
    "Doğru Sayısı",
    "Yanlış Sayısı",
    "Boş Sayısı",
    "cum_lecture_time",
    "cum_study_time",
    "Overlap (secs.)",
]
df.drop(columns=redundant_columns, inplace=True)

In [None]:
# Recompute the pairwise Pearson correlation on the reduced column set
# (Pearson is the DataFrame.corr default; it is only spelled out here).
corr = df.corr(method='pearson')

# Draw the matrix as a Seaborn heatmap and render the figure
sns.heatmap(data=corr, cmap="YlGnBu")
plt.show()

In [None]:
# Preview the first five rows of the reduced frame.
df.head(n=5)

In [None]:
# A column holding a single value carries no information, so print the
# distinct values of the candidate columns to check for that.

for candidate_col in ('Sınav Türü', 'Sınıf', 'Label'):
    print(df[candidate_col].unique())

Each of these columns takes more than one value, so none of them needs to be removed as a single-valued column.

In [None]:
# Show the dtype of every remaining column.
df.dtypes

In [None]:
# Print the column names, non-null counts and dtypes of df.
df.info()

We have no null values, so there is no missing data to deal with. Since we plan to use XGBoost, every column must be of type int, float, or bool. Our data is already in this format, so no data type conversion is needed.

In [None]:
# Class distribution of the target before the data is split.
# `y_train` only exists after the train/test split further down, so using it
# here fails on a fresh kernel (Restart & Run All); the Label column of df is
# what is available at this point.
df['Label'].value_counts()

### Format the data 1 - Split data into train and test

In [None]:
# Label is the prediction target, so the feature matrix X is an independent
# copy of df with that column removed.

X = df.drop(columns='Label').copy()
X.head(n=5)

In [None]:
# The target vector y is an independent copy of the Label column.

y = df.loc[:, 'Label'].copy()
y.head(n=5)

In [None]:
### Format the data 2 : One Hot Encoding

# Turn the categorical columns into one indicator column per distinct value.
# NOTE(review): 'Üye' looks like a member identifier; one-hot encoding it adds
# one column per member and may let the model memorise individuals — confirm
# that this is intended.

categorical_columns = [
    'Üye', 'Sınav', 'Konu', 'Sınav Türü', 'GirişLog', 'Sınıf', 'Ders',
    'Aktif Materyal Sayısı', 'Toplam Materyal Sayısı',
]
X_encoded = pd.get_dummies(X, columns=categorical_columns)

# DataFrame.info() prints its report itself and returns None, so it is called
# directly; wrapping it in print() would append a stray "None" to the output.
X_encoded.info()

X_encoded.head()


In [None]:
# Show the distinct values present in y; only 0 and 1 are expected.

y.unique()

### Build a preliminary XGBoost Model

In [None]:
# Share of positive (Label == 1) rows in the target; a small value means the
# classes are imbalanced.

y.sum() / len(y)

Our target variable is highly imbalanced. Thus, we use stratified sampling to split the data into train and test sets.

In [None]:
# Split the encoded features and the target into 80% train / 20% test.
# stratify=y keeps the share of positive labels the same in both parts, and
# the fixed random_state makes the split reproducible.

X_train, X_test, y_train, y_test = train_test_split(
    X_encoded,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

In [None]:
# The share of positive labels should be nearly identical in both parts if
# the stratified split worked.

for split_name, split_labels in (('y_train', y_train), ('y_test', y_test)):
    print(f'{split_name} ratio:', sum(split_labels)/len(split_labels))

In [None]:
# Fraction of all rows that ended up in the test part (0.2 was requested).
n_test_rows, n_train_rows = len(y_test), len(y_train)
n_test_rows / (n_test_rows + n_train_rows)

It seems stratified sampling is working well. 


Now we use the XGBoost classifier. Instead of determining the optimal number of trees ourselves, we let early stopping choose it. Early stopping is a method that lets you specify a performance metric to evaluate the model at every step of the training process, and it stops training when that metric has not improved for a given number of steps.

In [None]:
# Create and fit the baseline XGBClassifier model.
#
# early_stopping_rounds and eval_metric are given to the constructor: fit()
# deprecated these two arguments in XGBoost 1.6 and stopped accepting them in
# 2.0, so passing them there raises a TypeError on current releases.
# random_state is the scikit-learn style name for the seed.
# NOTE(review): the test split is also the early-stopping evaluation set, so
# the metrics reported on it afterwards are optimistic; a separate validation
# split would avoid that.

clf_xgb = xgb.XGBClassifier(
    objective='binary:logistic',
    random_state=42,
    eval_metric='aucpr',
    early_stopping_rounds=10,
)
clf_xgb.fit(X_train, y_train, eval_set=[(X_test, y_test)], verbose=True)

In [None]:
# Confusion matrix of the fitted model on the test part, with integer cell
# counts and readable class names.
ConfusionMatrixDisplay.from_estimator(
    clf_xgb,
    X_test,
    y_test,
    display_labels=['Not GTS', 'GTS'],
    values_format='d',
    cmap='Blues',
)

Our default XGBoost model is not performing well. We need to tune the hyperparameters to improve the performance.


### Optimize the hyperparameters using cross validation and GridSearchCV

In [None]:
# # first try

# param_grid = {
#     'max_depth': [3, 6, 9],
#     'learning_rate': [0.001, 0.005, 0.01, 0.05, 0.1],
#     #'n_estimators': [100, 200],
#     #'gamma': [0.0, 0.25, 0.5, 1.0],
#     #'subsample': [0.5, 0.9],
#     #'colsample_bytree': [0.3, 0.5, 0.9],
#     #'reg_alpha': [0, 0.25, 0.5, 0.75, 1],
#     #'reg_lambda': [10, 20, 100],
#     'scale_pos_weight': [1, 3, 5]
# }

In [None]:
# since the operation takes too long, we commented out the code


# optimal_params = GridSearchCV(
#     estimator=clf_xgb,
#     param_grid=param_grid,
#     scoring='roc_auc',
#     verbose=4,
#     cv=5
# )


# optimal_params.fit(X_train, y_train)

In [None]:
# optimal_params.best_params_

In [None]:
# optimal_params.best_estimator_

In [None]:
# optimal_params.best_score_

In [None]:
# resultsdf = pd.DataFrame(optimal_params.cv_results_)
# resultsdf = resultsdf.sort_values(by=['rank_test_score'], ascending=False)
# resultsdf.head()

In [None]:
# Fit the XGBoost model again with the tuned hyperparameters.
#
# early_stopping_rounds and eval_metric are given to the constructor: fit()
# deprecated these two arguments in XGBoost 1.6 and stopped accepting them in
# 2.0, so passing them there raises a TypeError on current releases.
# random_state is the scikit-learn style name for the seed.
# NOTE(review): the test split is also the early-stopping evaluation set, so
# the metrics reported on it afterwards are optimistic; a separate validation
# split would avoid that.

clf_xgb = xgb.XGBClassifier(random_state=42,
                            objective='binary:logistic',
                            gamma=0.25,
                            learning_rate=0.01,
                            max_depth=6,
                            reg_alpha=0.25,
                            reg_lambda=5,
                            scale_pos_weight=30,  # extra weight on the rare positive class
                            subsample=0.9,
                            colsample_bytree=0.5,
                            n_estimators=100,
                            eval_metric='aucpr',
                            early_stopping_rounds=10,
                            )

clf_xgb.fit(X_train,
            y_train,
            eval_set=[(X_test, y_test)],
            verbose=True)

In [None]:
# Confusion matrix of the tuned model on the test part, with integer cell
# counts and readable class names.
ConfusionMatrixDisplay.from_estimator(
    clf_xgb,
    X_test,
    y_test,
    display_labels=['Not GTS', 'GTS'],
    values_format='d',
    cmap='Blues',
)

In [None]:
# TODO: plot one of the trees of the fitted model here.
