# Predicting student learning performance in a Virtual Learning Environment using activity data

In [None]:
!easy_install ibm_db

***Important note:*** If the install command above does not work on your computer, see this GitHub issue for troubleshooting steps: https://github.com/ibmdb/python-ibmdb/issues/276

In [None]:
import ibm_db
import ibm_db_dbi
import seaborn as sns
import pandas as pd
import numpy as np

## A cleaner way to import the data while preserving table names

In [None]:
# Schema that owns the tables; replace the <schema_name> placeholder.
schema_name = '<schema_name>'

# Tables to fetch; each one becomes a DataFrame keyed by its table name.
table_names = [
    'STUDENT_INFO',
    'ASSESSMENTS',
    'COURSES',
    'VLE',
    'STUDENT_ASSESSMENT',
    'STUDENT_REGISTRATION',
    'STUDENT_VLE2',
]

In [None]:
# replace only <> credentials
# The driver name uses single braces: this string is never passed through
# str.format(), so doubled braces would be sent to the driver literally.
dsn = (
    "DRIVER={IBM DB2 ODBC DRIVER};"
    "DATABASE=<DATABASE>;"
    "HOSTNAME=<HOSTNAME>;"
    "PORT=50000;"
    "PROTOCOL=TCPIP;"
    "UID=<UID>;"
    "PWD=<PWD>;"
)

# hdbc is the low-level ibm_db handle; hdbi wraps it in the DB-API connection
# that is handed to pandas.read_sql further down. Both stay None on failure.
hdbc = None
hdbi = None
try:
    hdbc = ibm_db.connect(dsn, "", "")
    hdbi = ibm_db_dbi.Connection(hdbc)
    print('Connection Established!')
except Exception as exc:
    print('Error in Connecting to Database')
    # Surface the driver's message instead of discarding it.
    print("Reason: " + str(exc))

# Check both handles: the DB-API wrapper can fail after connect() succeeded.
if hdbc is None or hdbi is None:
    # Mask the password so it does not end up in the saved notebook output.
    safe_dsn = ";".join(
        "PWD=***" if part.startswith("PWD=") else part
        for part in dsn.split(";")
    )
    print("\nERROR: Unable to connect to the database.")
    print("Connection string used: " + safe_dsn + "\n")

In [None]:
# Pull every configured table into memory, keyed by table name.
lop_tables = {}
for table_name in table_names:
    print('Loading Data From ' + table_name+ '...')
    query = "SELECT * FROM " + schema_name + "." + table_name
    lop_tables[table_name] = pd.DataFrame(data=pd.read_sql(query, hdbi))

In [None]:
# Summary statistics for the STUDENT_INFO table.
lop_tables['STUDENT_INFO'].describe()

In [None]:
# Column names, dtypes and non-null counts for STUDENT_INFO.
lop_tables['STUDENT_INFO'].info()

In [None]:
# Summary of the final_result column in STUDENT_INFO.
lop_tables['STUDENT_INFO']['final_result'].describe()

In [None]:
# Number of STUDENT_INFO rows for each distinct final_result value.
lop_tables['STUDENT_INFO']['final_result'].value_counts()

In [None]:
# Bar chart of how many STUDENT_INFO rows fall into each final_result value.
# x and y are passed by keyword: a lone positional Series is read as `x` by
# older seaborn releases and as wide-form `data` by newer ones, so the
# positional form does not draw the same chart everywhere.
result_counts = lop_tables['STUDENT_INFO']['final_result'].value_counts()
sns.barplot(x=result_counts.index, y=result_counts.values)

In [None]:
# Preview the first rows of STUDENT_INFO.
lop_tables['STUDENT_INFO'].head()

## Can we use VLE activities to predict a student's final result?

In [None]:
# Plot studied_credits against final_result for STUDENT_INFO, using
# catplot's default plot kind (no `kind` argument is given).
sns.catplot(x = "final_result", y = "studied_credits", data = lop_tables['STUDENT_INFO'])

In [None]:
# Preview the first rows of STUDENT_ASSESSMENT.
lop_tables['STUDENT_ASSESSMENT'].head()

In [None]:
# Summary statistics for the score column of STUDENT_ASSESSMENT.
lop_tables['STUDENT_ASSESSMENT']['score'].describe()

In [None]:
# Schema of STUDENT_INFO, shown next to STUDENT_VLE2's because these two
# tables are the inputs to the first merge below.
lop_tables['STUDENT_INFO'].info()

In [None]:
# Column names, dtypes and non-null counts for STUDENT_VLE2.
lop_tables['STUDENT_VLE2'].info()

## Joining the datasets to merge related data

We use the pandas merge function to join the data from the required dataframes to form one large supertable.

In [None]:
# Inner-join student records with their VLE activity rows.
# NOTE(review): the key is id_student alone. If both tables also carry
# code_module / code_presentation, pandas suffixes them _x/_y and a student
# present in several presentations is cross-matched - confirm this is intended.
student_full_vle = lop_tables['STUDENT_INFO'].merge(
    lop_tables['STUDENT_VLE2'],
    how='inner',
    on='id_student',
)

In [None]:
# Attach the VLE table's columns to every activity row via id_site.
student_full_vle_details = student_full_vle.merge(
    lop_tables['VLE'],
    how='inner',
    on='id_site',
)

In [None]:
# Last rows of the two-table join (student_full_vle, i.e. before the VLE
# table's columns are merged in).
student_full_vle.tail()

In [None]:
# Column names, dtypes and non-null counts for the VLE table.
lop_tables['VLE'].info()

In [None]:
# Preview the first rows of the three-table merge.
student_full_vle_details.head()

Group by id_student and code_module_x + code_presentation_x

In [None]:
# Column names, dtypes and non-null counts for the merged frame.
student_full_vle_details.info()

In [None]:
# Store final_result as plain Python strings (overwrites the column in place).
student_full_vle_details['final_result'] = (
    student_full_vle_details['final_result'].astype(str)
)

In [None]:
# Re-inspect the merged frame's columns and dtypes.
student_full_vle_details.info()

In [None]:
# Bar plot of sum_click for each final_result value.
# NOTE(review): with kind='bar' seaborn draws an aggregate per category
# (the mean, by its documented default) - confirm that is the intended statistic.
sns.catplot(x = 'final_result', y = 'sum_click', data = student_full_vle_details, kind='bar')

In [None]:
# Count of rows per activity_type. The returned FacetGrid is kept under its
# own name: `plt` is the conventional alias for matplotlib.pyplot, and binding
# a grid to it would break any later pyplot call made through that name.
activity_grid = sns.catplot(x = 'activity_type', kind = 'count', data = student_full_vle_details)
# Tilt the tick labels so the activity type names do not overlap.
activity_grid.set_xticklabels(rotation = 45)

An inspection of the midsection of the dataset

In [None]:
# Show five rows starting at the midpoint of the merged frame. The offset is
# derived from the frame's length so the preview is never empty on a smaller
# extract and really is the middle on a larger one.
mid_row = len(student_full_vle_details) // 2
student_full_vle_details.iloc[mid_row:mid_row + 5]

Checking for null data in each column

In [None]:
# Number of missing values per column, computed with pandas' vectorised
# reduction instead of Python's built-in sum() inside a per-column apply.
student_full_vle_details.isnull().sum()

Looks good. Time to encode the categorical data columns:

In [None]:
from sklearn.preprocessing import LabelEncoder



In [None]:
# Columns holding categorical values that are replaced by integer codes.
cat_feature_names = ['gender', 'region', 'activity_type', 'disability', 'age_band', 'imd_band', 'highest_education', 'code_module', 'code_presentation']

# Keep one fitted encoder per column. Refitting a single shared encoder would
# discard every mapping but the last, leaving no way to translate the integer
# codes back to their labels (label_encoders[col].inverse_transform / .classes_).
label_encoders = {}
for f in cat_feature_names:
    le = LabelEncoder()
    # Overwrites the column in place with its integer codes.
    student_full_vle_details[f] = le.fit_transform(student_full_vle_details[f])
    label_encoders[f] = le
student_full_vle_details.dtypes

In [None]:
import numpy as np

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import KFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier, export_graphviz
from sklearn import metrics

def classification_model(model, data, predictors, outcome,
                         n_splits=5, shuffle=False, random_state=None):
    """Fit ``model`` and print its training accuracy and k-fold CV score.

    The model is fitted on every row of ``data`` and scored on those same
    rows, so the printed "Accuracy" is a training-set figure. It is then
    re-fitted once per fold to obtain the mean held-out score, and finally
    re-fitted on all rows so the caller is left with a model trained on the
    complete dataset.

    Parameters
    ----------
    model : estimator
        Object exposing ``fit``, ``predict`` and ``score``; it is fitted in
        place.
    data : pandas.DataFrame
        Frame containing the predictor and outcome columns.
    predictors : list of str
        Names of the feature columns.
    outcome : str
        Name of the target column.
    n_splits : int, default 5
        Number of cross-validation folds.
    shuffle : bool, default False
        Passed to ``KFold``. The default keeps the folds as contiguous row
        ranges.
    random_state : int or None, default None
        Passed to ``KFold``; only meaningful together with ``shuffle=True``.

    Returns
    -------
    None
        Both scores are printed rather than returned.
    """
    # Select the columns once instead of re-slicing the frame on every fold.
    features = data[predictors]
    target = data[outcome]

    model.fit(features, target)
    # accuracy_score expects (y_true, y_pred), in that order.
    accuracy = metrics.accuracy_score(target, model.predict(features))
    print("Accuracy : %s" % "{0:.3%}".format(accuracy))

    folds = KFold(n_splits=n_splits, shuffle=shuffle, random_state=random_state)
    fold_scores = []
    for train_idx, test_idx in folds.split(features):
        model.fit(features.iloc[train_idx], target.iloc[train_idx])
        fold_scores.append(model.score(features.iloc[test_idx], target.iloc[test_idx]))

    print("Cross validation score : %s" % "{0:.3%}".format(np.mean(fold_scores)))

    # The last fold's fit saw only part of the data; refit on all of it.
    model.fit(features, target)

In [None]:
# Baseline: logistic regression on a small set of activity and student features.
# outcome_var and predictor_var stay module-level because later cells reuse them.
predictor_var = [
    'id_site',
    'sum_click',
    'activity_type',
    'studied_credits',
    'highest_education',
]
outcome_var = 'final_result'
model = LogisticRegression()
classification_model(
    model=model,
    data=student_full_vle_details,
    predictors=predictor_var,
    outcome=outcome_var,
)

In [None]:
# Decision tree on the same predictors. A fixed random_state makes the fitted
# tree, and therefore the printed scores, repeatable across runs.
model = DecisionTreeClassifier(random_state=42)
classification_model(model, student_full_vle_details, predictor_var, outcome_var)

In [None]:
# Random forest on a wider predictor set. A fixed random_state makes the
# bootstrap samples and feature draws, and so the scores, repeatable.
model = RandomForestClassifier(n_estimators=100, random_state=42)
predictor_var = ['imd_band', 'age_band', 'num_of_prev_attempts','studied_credits','code_module','code_presentation','activity_type','sum_click','id_site']
classification_model(model, student_full_vle_details, predictor_var, outcome_var)


In [None]:
# Smaller, depth-limited forest on the current predictor_var list. A fixed
# random_state keeps the printed scores repeatable.
model = RandomForestClassifier(n_estimators=30, max_depth=5, random_state=42)
classification_model(model, student_full_vle_details, predictor_var, outcome_var)