In [83]:
pip install sklearn-pandas

Note: you may need to restart the kernel to use updated packages.


In [230]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from sklearn.utils import shuffle
from sklearn.metrics import accuracy_score

In [267]:
df = pd.read_csv('StudentPerformanceFactors.csv', engine='python')

In [268]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6607 entries, 0 to 6606
Data columns (total 20 columns):
 #   Column                      Non-Null Count  Dtype 
---  ------                      --------------  ----- 
 0   Hours_Studied               6607 non-null   int64 
 1   Attendance                  6607 non-null   int64 
 2   Parental_Involvement        6607 non-null   object
 3   Access_to_Resources         6607 non-null   object
 4   Extracurricular_Activities  6607 non-null   object
 5   Sleep_Hours                 6607 non-null   int64 
 6   Previous_Scores             6607 non-null   int64 
 7   Motivation_Level            6607 non-null   object
 8   Internet_Access             6607 non-null   object
 9   Tutoring_Sessions           6607 non-null   int64 
 10  Family_Income               6607 non-null   object
 11  Teacher_Quality             6529 non-null   object
 12  School_Type                 6607 non-null   object
 13  Peer_Influence              6607 non-null   obje

In [269]:
df.head()

Unnamed: 0,Hours_Studied,Attendance,Parental_Involvement,Access_to_Resources,Extracurricular_Activities,Sleep_Hours,Previous_Scores,Motivation_Level,Internet_Access,Tutoring_Sessions,Family_Income,Teacher_Quality,School_Type,Peer_Influence,Physical_Activity,Learning_Disabilities,Parental_Education_Level,Distance_from_Home,Gender,Exam_Score
0,23,84,Low,High,No,7,73,Low,Yes,0,Low,Medium,Public,Positive,3,No,High School,Near,Male,67
1,19,64,Low,Medium,No,8,59,Low,Yes,2,Medium,Medium,Public,Negative,4,No,College,Moderate,Female,61
2,24,98,Medium,Medium,Yes,7,91,Medium,Yes,2,Medium,Medium,Public,Neutral,4,No,Postgraduate,Near,Male,74
3,29,89,Low,Medium,Yes,8,98,Medium,Yes,1,Medium,Medium,Public,Negative,4,No,High School,Moderate,Male,71
4,19,92,Medium,Medium,Yes,6,65,Medium,Yes,3,Medium,High,Public,Neutral,4,No,College,Near,Female,70


Заполним пустые значения и трансформируем данные для возможности машинного обучения.

In [270]:
for column in ['Teacher_Quality', 'Parental_Education_Level', 'Distance_from_Home']:
    df[column] = df[column].fillna(df[column].mode()[0])

In [271]:
df = shuffle(df)

In [272]:
le = LabelEncoder()

for column in ['Parental_Involvement', 'Access_to_Resources', 'Extracurricular_Activities', 'Motivation_Level', 'Internet_Access', 'Family_Income', 
               'Teacher_Quality', 'School_Type', 'Peer_Influence', 'Learning_Disabilities', 'Parental_Education_Level', 'Distance_from_Home', 
                 'Gender']:
    df[column] = le.fit_transform(df[column])

In [273]:
df.head()

Unnamed: 0,Hours_Studied,Attendance,Parental_Involvement,Access_to_Resources,Extracurricular_Activities,Sleep_Hours,Previous_Scores,Motivation_Level,Internet_Access,Tutoring_Sessions,Family_Income,Teacher_Quality,School_Type,Peer_Influence,Physical_Activity,Learning_Disabilities,Parental_Education_Level,Distance_from_Home,Gender,Exam_Score
343,17,78,2,0,0,8,88,2,1,0,1,2,1,0,3,0,1,2,0,65
6046,20,89,2,2,1,8,51,2,1,3,2,0,1,2,3,0,2,2,0,70
1329,24,89,2,1,1,6,94,1,1,2,2,2,1,1,2,0,1,0,1,68
4402,15,92,1,2,0,8,69,2,1,1,0,2,0,1,4,0,2,2,1,68
5956,19,66,2,2,0,8,83,2,1,2,2,0,1,1,3,0,1,2,0,65


Разделим датасет на 2 датасета. Первый используем для обучения на основе правил. Из второго возьмем 100 случайных строк, загрузим в csv-файл и используем для ручной разметки.

In [274]:
labed_df, unlabed_df = train_test_split(df, train_size=0.1, random_state=50)

In [275]:
labed_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 660 entries, 6458 to 91
Data columns (total 20 columns):
 #   Column                      Non-Null Count  Dtype
---  ------                      --------------  -----
 0   Hours_Studied               660 non-null    int64
 1   Attendance                  660 non-null    int64
 2   Parental_Involvement        660 non-null    int32
 3   Access_to_Resources         660 non-null    int32
 4   Extracurricular_Activities  660 non-null    int32
 5   Sleep_Hours                 660 non-null    int64
 6   Previous_Scores             660 non-null    int64
 7   Motivation_Level            660 non-null    int32
 8   Internet_Access             660 non-null    int32
 9   Tutoring_Sessions           660 non-null    int64
 10  Family_Income               660 non-null    int32
 11  Teacher_Quality             660 non-null    int32
 12  School_Type                 660 non-null    int32
 13  Peer_Influence              660 non-null    int32
 14  Physical_Acti

In [276]:
unlabed_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 5947 entries, 4259 to 1092
Data columns (total 20 columns):
 #   Column                      Non-Null Count  Dtype
---  ------                      --------------  -----
 0   Hours_Studied               5947 non-null   int64
 1   Attendance                  5947 non-null   int64
 2   Parental_Involvement        5947 non-null   int32
 3   Access_to_Resources         5947 non-null   int32
 4   Extracurricular_Activities  5947 non-null   int32
 5   Sleep_Hours                 5947 non-null   int64
 6   Previous_Scores             5947 non-null   int64
 7   Motivation_Level            5947 non-null   int32
 8   Internet_Access             5947 non-null   int32
 9   Tutoring_Sessions           5947 non-null   int64
 10  Family_Income               5947 non-null   int32
 11  Teacher_Quality             5947 non-null   int32
 12  School_Type                 5947 non-null   int32
 13  Peer_Influence              5947 non-null   int32
 14  Physical_A

In [277]:
df_sample = unlabed_df.sample(n=100, random_state=42)

In [278]:
df_rest = unlabed_df.drop(df_sample.index)

In [279]:
df_sample.to_csv('labed_df.csv')

In [280]:
!label-studio

^C


Прочитаем файл с размеченными вручную данными. Переименуем, что нужно, удалим, что не нужно.

In [281]:
df_labeled = pd.read_csv('project7.csv', engine='python')

In [282]:
df_labeled.rename(columns={'choice': 'Choice'}, inplace=True)

In [283]:
df_labeled.drop(['updated_at', 'lead_time', 'created_at', 'annotator', 'annotation_id', 'id', 'Unnamed: 0'], axis=1, inplace=True)

К первому тестовому датасету применим правило.

In [302]:
def rule_score(row):
    if row['Exam_Score'] >= 75:
        return 'good'
    else:
        return 'bad'

labed_df['Choice'] = labed_df.apply(rule_score, axis=1)

In [286]:
labed_df.head()

Unnamed: 0,Hours_Studied,Attendance,Parental_Involvement,Access_to_Resources,Extracurricular_Activities,Sleep_Hours,Previous_Scores,Motivation_Level,Internet_Access,Tutoring_Sessions,...,Teacher_Quality,School_Type,Peer_Influence,Physical_Activity,Learning_Disabilities,Parental_Education_Level,Distance_from_Home,Gender,Exam_Score,Choice
6458,16,84,0,0,0,6,88,1,1,1,...,2,0,0,1,1,1,2,1,66,bad
3274,15,91,1,2,0,6,64,1,1,1,...,0,1,2,3,1,1,2,1,64,bad
4342,32,85,2,1,1,9,63,1,0,2,...,2,1,1,3,0,1,1,1,69,bad
5189,30,87,2,2,0,8,53,1,1,1,...,0,0,1,1,0,2,2,1,70,bad
5981,30,88,1,2,1,6,90,0,1,4,...,2,0,0,2,0,0,1,1,73,bad


Объединим оба размеченных датасета.

In [287]:
df_combined = pd.concat([labed_df, df_labeled])

In [288]:
df_combined.info()

<class 'pandas.core.frame.DataFrame'>
Index: 760 entries, 6458 to 99
Data columns (total 21 columns):
 #   Column                      Non-Null Count  Dtype 
---  ------                      --------------  ----- 
 0   Hours_Studied               760 non-null    int64 
 1   Attendance                  760 non-null    int64 
 2   Parental_Involvement        760 non-null    int64 
 3   Access_to_Resources         760 non-null    int64 
 4   Extracurricular_Activities  760 non-null    int64 
 5   Sleep_Hours                 760 non-null    int64 
 6   Previous_Scores             760 non-null    int64 
 7   Motivation_Level            760 non-null    int64 
 8   Internet_Access             760 non-null    int64 
 9   Tutoring_Sessions           760 non-null    int64 
 10  Family_Income               760 non-null    int64 
 11  Teacher_Quality             760 non-null    int64 
 12  School_Type                 760 non-null    int64 
 13  Peer_Influence              760 non-null    int64 
 1

In [289]:
df_combined.head()

Unnamed: 0,Hours_Studied,Attendance,Parental_Involvement,Access_to_Resources,Extracurricular_Activities,Sleep_Hours,Previous_Scores,Motivation_Level,Internet_Access,Tutoring_Sessions,...,Teacher_Quality,School_Type,Peer_Influence,Physical_Activity,Learning_Disabilities,Parental_Education_Level,Distance_from_Home,Gender,Exam_Score,Choice
6458,16,84,0,0,0,6,88,1,1,1,...,2,0,0,1,1,1,2,1,66,bad
3274,15,91,1,2,0,6,64,1,1,1,...,0,1,2,3,1,1,2,1,64,bad
4342,32,85,2,1,1,9,63,1,0,2,...,2,1,1,3,0,1,1,1,69,bad
5189,30,87,2,2,0,8,53,1,1,1,...,0,0,1,1,0,2,2,1,70,bad
5981,30,88,1,2,1,6,90,0,1,4,...,2,0,0,2,0,0,1,1,73,bad


Разделим данные на обучающий и тестовый наборы. Обучим модель логистической регрессии.

In [290]:
X = df_combined[['Hours_Studied', 'Attendance', 'Parental_Involvement', 'Access_to_Resources', 'Extracurricular_Activities', 'Sleep_Hours', 
                 'Previous_Scores', 'Motivation_Level', 'Internet_Access', 'Tutoring_Sessions', 'Family_Income', 'Teacher_Quality', 'School_Type',
                 'Peer_Influence', 'Physical_Activity', 'Learning_Disabilities', 'Parental_Education_Level', 'Distance_from_Home', 
                 'Gender', 'Exam_Score']]

In [291]:
y = df_combined['Choice']

In [292]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [296]:
model = LogisticRegression(max_iter=1000)


In [297]:
model.fit(X_train, y_train)

Предскажем результаты на тестовом наборе и вычислим точность модели.

In [298]:
y_pred = model.predict(X_test)

In [299]:
accuracy = accuracy_score(y_test, y_pred)
print('Точность модели:', accuracy)

Точность модели: 1.0


Точность модели 100%, что логично, задача была простая. В данном конкретном случае, отдаю предпочтение обучению на основе правил, так как они четко определены и не несут риска человеческого фактора, в отличае от ручной разметки.