In [1]:
import pandas as pd
from rake_nltk import Rake
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import CountVectorizer

In [12]:
df = pd.read_csv('BSA-dataset-synthetic.csv', sep=';')

# Recode the two categorical columns as 0/1 indicators (vectorised, no
# per-row lambda):
#   Gender -> 1 for 'F',  0 for every other value (including missing)
#   BSA    -> 1 for 'PS', 0 for every other value (including missing)
df['Gender'] = (df['Gender'] == 'F').astype('int64')
df['BSA'] = (df['BSA'] == 'PS').astype('int64')

# Only the last expression of a cell is displayed, so show the column
# overview here (a bare df.head() earlier in the cell would be discarded).
df.columns


Index(['train', 'Gender', 'Nationality', 'PreEducation', 'Program', 'Year',
       'BSA', 'Course3-1', 'Course9-1', 'Course8-1', 'Course7-1', 'Course3-R',
       'Course9-R', 'Credits-B1B2', 'Course23', 'Course26', 'Course3',
       'Course16', 'Course8', 'Course9', 'Course22', 'Course7', 'Course25',
       'Course24', 'Credits-Y1'],
      dtype='object')

In [111]:
class Course:
    """Grades of one student for one course, plus derived 0/1 indicators.

    Raw grades are stored through the ``set_*`` methods. ``create_dummies``
    derives the boolean indicators and ``for_df_column`` flattens everything
    into a ``{column name: value}`` dict.

    ``blok`` records which setters were called and selects the column prefix
    used by ``for_df_column`` (presumably the teaching block of the course --
    TODO confirm):

    * ``None`` -- neither attempt setter was called          (prefix ``x_``)
    * ``2``    -- only ``set_first_try_grade`` was called    (prefix ``2_``)
    * ``1``    -- ``set_resit_grade`` was called             (prefix ``1_``)
    """

    # Minimum grade that counts as a pass.
    PASS_GRADE = 5.5

    def __init__(self, course_name):
        """Create an empty record for the course called ``course_name``."""
        self.course_name = course_name
        self.endgrade = None
        self.first_try_grade = None
        self.resit_grade = None
        self.blok = None

        # Indicator ("dummy") variables; they stay None until create_dummies()
        # has been called.
        self.passed_Course = None
        self.firstTry_passed = None
        self.firstTry_present = None
        self.both_notPresent = None

    @staticmethod
    def _missing_to_none(grade):
        """Return None for a missing grade (None or NaN), else the grade itself."""
        if grade is None or np.isnan(grade):
            return None
        return grade

    def set_endgrade(self, endgrade):
        """Store the final grade exactly as given (a NaN value is kept as NaN)."""
        self.endgrade = endgrade

    def set_first_try_grade(self, grade):
        """Store the first-attempt grade; a missing grade (None/NaN) becomes None."""
        self.first_try_grade = self._missing_to_none(grade)
        # A course that already has resit information stays blok 1, so the
        # result does not depend on the order in which the setters are called.
        if self.blok != 1:
            self.blok = 2

    def set_resit_grade(self, grade):
        """Store the resit grade; a missing grade (None/NaN) becomes None."""
        self.resit_grade = self._missing_to_none(grade)
        self.blok = 1

    def create_dummies(self):
        """Derive the boolean indicators from the stored grades.

        * ``passed_Course``    -- end grade is at least ``PASS_GRADE``
          (False when the end grade is None or NaN).
        * ``firstTry_present`` -- a first-attempt grade is available.
        * ``firstTry_passed``  -- blok 1: first attempt present, no resit
          grade and the course was passed; blok 2: first-attempt grade is at
          least ``PASS_GRADE``.
        * ``both_notPresent``  -- blok 1 only: neither a first-attempt nor a
          resit grade is available.

        The attempt indicators are only filled in when ``blok`` is set. They
        are computed even when the end grade is missing, so that
        ``for_df_column`` never has to convert None to int.
        """
        # NaN compares False with >=, so a NaN end grade counts as "not passed".
        self.passed_Course = (
            self.endgrade is not None and bool(self.endgrade >= self.PASS_GRADE)
        )

        # No per-attempt data was supplied for this course.
        if self.blok is None:
            return

        first_try_present = self.first_try_grade is not None
        no_resit = self.resit_grade is None
        self.firstTry_present = first_try_present

        if self.blok == 1:
            # With resit information available, a first-attempt pass is
            # inferred from "passed the course without needing a resit".
            self.firstTry_passed = (
                first_try_present and no_resit and self.passed_Course
            )
            self.both_notPresent = (not first_try_present) and no_resit
        elif self.blok == 2:
            # Original author's note: "this still has to be fixed" -- what
            # exactly should change here is not stated; TODO confirm.
            self.firstTry_passed = first_try_present and bool(
                self.first_try_grade >= self.PASS_GRADE
            )

    def __print__(self):
        # NOTE: ``__print__`` is not a Python special method, so print()/str()
        # never call it; it only runs when invoked explicitly.
        string = "mooi man"
        return string

    def for_df_column(self):
        """Return this course's values as a ``{column name: value}`` dict.

        Column names are ``<prefix>_<course_name>_<field>`` where the prefix
        is ``x``, ``1`` or ``2`` depending on ``blok``. An empty dict is
        returned when ``create_dummies`` has not been called yet.
        """
        if self.passed_Course is None:
            return {}

        if self.blok is None:
            stem = "x_" + self.course_name
        elif self.blok in (1, 2):
            stem = str(self.blok) + "_" + self.course_name
        else:
            # Unknown blok value: nothing to report.
            return {}

        courses_data = {
            stem + "_Endgrade": self.endgrade,
            stem + "_PassedCourse": int(self.passed_Course),
        }
        if self.blok is not None:
            courses_data[stem + "_FirstTryPassed"] = int(self.firstTry_passed)
            courses_data[stem + "_FirstTryPresent"] = int(self.firstTry_present)
        if self.blok == 1:
            courses_data[stem + "_Both_notPresent"] = int(self.both_notPresent)
        return courses_data

# Build the derived course features: one dict of columns per student,
# collected in a list and turned into a DataFrame once at the end (instead of
# creating and concatenating one single-row DataFrame per student).

# Course-related columns, kept in their original order. The order matters
# because Course derives `blok` from the sequence of setter calls.
course_columns = [column for column in df.columns if column.startswith('Course')]
course_grades = df[course_columns]

course_records = []
for row_position in range(len(df)):
    # One Course object per course name for the current student.
    courses = {}
    for column_position, column in enumerate(course_columns):
        # 'Course3-1' and 'Course3-R' both belong to course 'Course3'.
        course_name = column.split('-')[0]
        if course_name not in courses:
            courses[course_name] = Course(course_name)
        course = courses[course_name]

        # Positional access, so the lookup does not rely on df having a
        # default 0..n-1 index.
        grade = course_grades.iat[row_position, column_position]

        # The column suffix decides which grade this value represents.
        if '-1' in column:
            course.set_first_try_grade(grade)
        elif '-R' in column:
            course.set_resit_grade(grade)
        else:
            course.set_endgrade(grade)

    # Flatten all courses of this student into a single record.
    row_data = {}
    for course in courses.values():
        course.create_dummies()
        row_data.update(course.for_df_column())
    course_records.append(row_data)

# Reuse df's index so the features line up row-by-row with the original data.
final_df = pd.DataFrame(course_records, index=df.index)

# Show the generated feature columns
final_df.columns


Index(['1_Course3_Endgrade', '1_Course3_PassedCourse',
       '1_Course3_FirstTryPassed', '1_Course3_FirstTryPresent',
       '1_Course3_Both_notPresent', '1_Course9_Endgrade',
       '1_Course9_PassedCourse', '1_Course9_FirstTryPassed',
       '1_Course9_FirstTryPresent', '1_Course9_Both_notPresent',
       '2_Course8_Endgrade', '2_Course8_PassedCourse',
       '2_Course8_FirstTryPassed', '2_Course8_FirstTryPresent',
       '2_Course7_Endgrade', '2_Course7_PassedCourse',
       '2_Course7_FirstTryPassed', '2_Course7_FirstTryPresent',
       'x_Course23_Endgrade', 'x_Course23_PassedCourse', 'x_Course26_Endgrade',
       'x_Course26_PassedCourse', 'x_Course16_Endgrade',
       'x_Course16_PassedCourse', 'x_Course22_Endgrade',
       'x_Course22_PassedCourse', 'x_Course25_Endgrade',
       'x_Course25_PassedCourse', 'x_Course24_Endgrade',
       'x_Course24_PassedCourse'],
      dtype='object')

In [115]:
# Keep every original column that is not a raw course column. The test uses
# startswith('Course'), the same rule that selects the columns turned into
# features, so a column that merely contains "Course" somewhere in its name
# is not dropped by accident.
non_course_columns_df = df.loc[:, ~df.columns.str.startswith('Course')]

# Put the derived course features next to the remaining original columns
# (rows are matched on the index).
merged_df = pd.concat([non_course_columns_df, final_df], axis=1)

# Display the merged DataFrame
merged_df

Unnamed: 0,train,Gender,Nationality,PreEducation,Program,Year,BSA,Credits-B1B2,Credits-Y1,1_Course3_Endgrade,...,x_Course26_Endgrade,x_Course26_PassedCourse,x_Course16_Endgrade,x_Course16_PassedCourse,x_Course22_Endgrade,x_Course22_PassedCourse,x_Course25_Endgrade,x_Course25_PassedCourse,x_Course24_Endgrade,x_Course24_PassedCourse
0,1,1,Nederland,Vwo,Program1,21/22,0,0,0,2.5,...,,0,,0,,0,,0,,0
1,2,0,Nederland,Vwo,Program1,21/22,0,0,0,3.5,...,5.0,0,,0,4.0,0,2.0,0,4.0,0
2,3,0,Azie,Buitenlands,Program1,21/22,0,0,0,2.5,...,,0,,0,,0,,0,,0
3,4,1,Nederland,Vwo,Program1,21/22,0,0,0,5.0,...,,0,,0,2.5,0,,0,,0
4,5,0,EU,Buitenlands,Program1,21/22,0,0,0,,...,,0,,0,,0,,0,,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
56,57,0,EU,Buitenlands,Program1,22/23,1,24,60,8.0,...,8.0,1,8.0,1,7.0,1,7.5,1,9.5,1
57,58,1,Nederland,Vwo,Program1,22/23,0,12,36,6.5,...,,0,6.0,1,6.0,1,3.5,0,7.0,1
58,59,1,Nederland,Vwo,Program1,22/23,1,18,48,6.5,...,8.0,1,6.5,1,6.0,1,8.5,1,7.5,1
59,60,1,Nederland,Vwo,Program1,22/23,1,18,54,6.5,...,6.0,1,6.0,1,8.0,1,6.5,1,7.5,1


In [107]:
# Save the merged DataFrame to an Excel file (without the index column)
output_path = 'merged_data.xlsx'
merged_df.to_excel(output_path, index=False)

# Confirm where the file was written (the file name is now quoted on both sides)
print(f"Merged data has been saved to '{output_path}'")

Merged data has been saved to 'merged_data.xlsx


In [121]:
# Display the 0/1 first-attempt attendance indicator generated for Course3.
merged_df["1_Course3_FirstTryPresent"]

0     1
1     1
2     1
3     1
4     0
     ..
56    1
57    1
58    1
59    1
60    1
Name: 1_Course3_FirstTryPresent, Length: 61, dtype: int64

In [130]:
import statsmodels.api as sm

# Regressors: 0/1 course indicators taken from the merged feature table.
predictors = [
    '1_Course3_PassedCourse',
    '1_Course3_FirstTryPassed',
    '1_Course3_Both_notPresent',
    '1_Course9_PassedCourse',
    '1_Course9_FirstTryPassed',
    '1_Course9_Both_notPresent',
    '2_Course8_FirstTryPassed',
    '2_Course7_FirstTryPassed',
]

# Design matrix: the selected indicators plus a constant column (intercept).
X = sm.add_constant(merged_df.loc[:, predictors])

# Response: the 'Credits-Y1' column, modelled with ordinary least squares.
y = merged_df.loc[:, 'Credits-Y1']
model = sm.OLS(endog=y, exog=X)

# Alternative kept for reference: logistic regression on the binary BSA outcome.
# y = merged_df['BSA']
# model = sm.Logit(y, X)

# Fit the model and print the full regression table.
result = model.fit()
print(result.summary())


# # Fit a ridge regression (pure L2 penalty: alpha=0.1, L1_wt=0)
# ridge_model = sm.OLS(y, X).fit_regularized(alpha=0.1, L1_wt=0, )

# # Print the summary of the regression results
# # Print the coefficients
# print("Coefficients:")
# print(ridge_model.params)

# # Print additional statistics such as standard errors and p-values
# print("\nAdditional statistics:")
# print(ridge_model.summary2())

                            OLS Regression Results                            
Dep. Variable:             Credits-Y1   R-squared:                       0.794
Model:                            OLS   Adj. R-squared:                  0.762
Method:                 Least Squares   F-statistic:                     25.04
Date:                Thu, 11 Apr 2024   Prob (F-statistic):           2.76e-15
Time:                        18:36:40   Log-Likelihood:                -223.67
No. Observations:                  61   AIC:                             465.3
Df Residuals:                      52   BIC:                             484.3
Df Model:                           8                                         
Covariance Type:            nonrobust                                         
                                coef    std err          t      P>|t|      [0.025      0.975]
---------------------------------------------------------------------------------------------
const                 