In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

In [2]:
#lets load the dataset and clean it
data = pd.read_csv("/content/DataSet.csv")
data.head()

Unnamed: 0,السنة,الشهر,الفئة العمرية,عدد معتمري الخارج,Unnamed: 4
0,1440,جمادى الآخرة,21 - 30,94768,
1,1440,جمادى الآخرة,31 - 40,144485,
2,1440,جمادى الآخرة,41 - 50,167322,
3,1440,جمادى الآخرة,51 - 60,208814,
4,1440,جمادى الآخرة,60'اكبر من,258698,


In [3]:
#we have NaN column, we will drop it to clean the data
data.drop(columns=["Unnamed: 4"], inplace=True)

In [4]:
data.head()

Unnamed: 0,السنة,الشهر,الفئة العمرية,عدد معتمري الخارج
0,1440,جمادى الآخرة,21 - 30,94768
1,1440,جمادى الآخرة,31 - 40,144485
2,1440,جمادى الآخرة,41 - 50,167322
3,1440,جمادى الآخرة,51 - 60,208814
4,1440,جمادى الآخرة,60'اكبر من,258698


In [5]:
#converting the column to number to deal with it
data["عدد معتمري الخارج"] = data["عدد معتمري الخارج"].str.replace(',', '').astype(int)
#

In [6]:
#Feature engineering to create a binary classification for the logistic regression
median= data['عدد معتمري الخارج'].median()


In [7]:
#now we will create new columan called High_Low_Pilgrims
data['High_Low_Pilgrims'] = (data['عدد معتمري الخارج'] > median).astype(int)#to deal with it as 0 or 1
data.head(100)

Unnamed: 0,السنة,الشهر,الفئة العمرية,عدد معتمري الخارج,High_Low_Pilgrims
0,1440,جمادى الآخرة,21 - 30,94768,1
1,1440,جمادى الآخرة,31 - 40,144485,1
2,1440,جمادى الآخرة,41 - 50,167322,1
3,1440,جمادى الآخرة,51 - 60,208814,1
4,1440,جمادى الآخرة,60'اكبر من,258698,1
...,...,...,...,...,...
95,1441,رجب,اصغر من 20,4162,0
96,1441,صفر,21 - 30,64345,0
97,1441,صفر,31 - 40,94975,1
98,1441,صفر,41 - 50,108293,1


In [8]:
data.head(100)

Unnamed: 0,السنة,الشهر,الفئة العمرية,عدد معتمري الخارج,High_Low_Pilgrims
0,1440,جمادى الآخرة,21 - 30,94768,1
1,1440,جمادى الآخرة,31 - 40,144485,1
2,1440,جمادى الآخرة,41 - 50,167322,1
3,1440,جمادى الآخرة,51 - 60,208814,1
4,1440,جمادى الآخرة,60'اكبر من,258698,1
...,...,...,...,...,...
95,1441,رجب,اصغر من 20,4162,0
96,1441,صفر,21 - 30,64345,0
97,1441,صفر,31 - 40,94975,1
98,1441,صفر,41 - 50,108293,1


In [20]:
# Encoding categorical variables
categorical_features = ['الشهر', 'الفئة العمرية']
one_hot = OneHotEncoder()
transformer = ColumnTransformer([("one_hot", one_hot, categorical_features)], remainder="passthrough")


In [39]:
# Transforming data
transformed_variables = transformer.fit_transform(data.drop('High_Low_Pilgrims', axis=1))
# Splitting data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(transformed_variables, data['High_Low_Pilgrims'], test_size=0.2 , random_state= 42)


In [40]:
#you can get more info about Logistic Regression from here https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html
logistic_model = LogisticRegression()
logistic_model.fit(X_train, y_train)


In [41]:
y_pred = logistic_model.predict(X_test)
print(f"Accuracy: {accuracy_score(y_test, y_pred):.2f}")
print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred))
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

Accuracy: 0.99

Confusion Matrix:
[[31  1]
 [ 0 39]]

Classification Report:
              precision    recall  f1-score   support

           0       1.00      0.97      0.98        32
           1       0.97      1.00      0.99        39

    accuracy                           0.99        71
   macro avg       0.99      0.98      0.99        71
weighted avg       0.99      0.99      0.99        71

