
# **FEATURETOOLS**

**Import Libraries**

In [None]:
# Install featuretools
!pip install featuretools

import featuretools as ft
import pandas as pd


Collecting featuretools
  Downloading featuretools-1.28.0-py3-none-any.whl (619 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/619.2 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m122.9/619.2 kB[0m [31m3.5 MB/s[0m eta [36m0:00:01[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━[0m [32m542.7/619.2 kB[0m [31m7.9 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m619.2/619.2 kB[0m [31m7.3 MB/s[0m eta [36m0:00:00[0m
Collecting holidays<0.33,>=0.13 (from featuretools)
  Downloading holidays-0.32-py3-none-any.whl (754 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m754.4/754.4 kB[0m [31m13.8 MB/s[0m eta [36m0:00:00[0m
Collecting woodwork>=0.23.0 (from featuretools)
  Downloading woodwork-0.27.0-py3-none-any.whl (236 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m23

**Import CSV File**

In [None]:
# Mount Google Drive and load the sampled dataset
from google.colab import drive

# Mount at an absolute path so it always matches the absolute CSV path below.
# A relative mount point ('gdrive') only lines up with '/content/gdrive/...'
# when the kernel's working directory happens to be '/content'.
drive.mount('/content/gdrive')

# Path to the CSV file on the mounted drive
csv_file_path = '/content/gdrive/MyDrive/3007 Group assignment/sampled_data.csv'

# Use pandas to read the CSV file
data = pd.read_csv(csv_file_path)

Mounted at gdrive


**Define Entities**

In [None]:
# Define entities
# Every entity is a (dataframe, index column name) tuple: the dataframe is a column
# subset of `data`, and the second element names the column used as that entity's index.

# Demographic / background attributes, keyed by Student_ID
student_columns = [
    'MaritalStatus', 'Nationality', 'Gender', 'ScholarshipHolder', 'International',
    'DaytimeEveningAttendance', 'Debtor', 'PreviousQualification',
    'PreviousQualificationGrade', 'MotherQualification', 'FatherQualification',
    'MotherOccupation', 'FatherOccupation', 'Displaced',
    'Student_ID',
]
student_entity = (data[student_columns], 'Student_ID')

# Enrollment attributes, keyed by Enrollment_ID
enrollment_columns = [
    'AdmissionGrade', 'EducationalSpecialNeeds',
    'TuitionFeesUpToDate', 'AgeAtEnrollment', 'Student_ID', 'Enrollment_ID',
]
enrollment_entity = (data[enrollment_columns], 'Enrollment_ID')

# Per-semester curricular results, macro-economic columns and the Target column,
# keyed by Performance_ID
performance_columns = [
    'CurricularUnits1stSemCredited',
    'CurricularUnits1stSemEnrolled',
    'CurricularUnits1stSemEvaluation',
    'CurricularUnits1stSemApproved',
    'CurricularUnits1stSemGrade',
    'CurricularUnits1stSemNoEvaluations',
    'CurricularUnits2ndSemCredited',
    'CurricularUnits2ndSemEnrolled',
    'CurricularUnits2ndSemEvaluations',
    'CurricularUnits2ndSemApproved',
    'CurricularUnits2ndSemGrade',
    'CurricularUnits2ndSemWithoutEvaluations',
    'UnemploymentRate',
    'InflationRate',
    'GDP',
    'Target',
    'Performance_ID',
    'Student_ID',
]
performance_entity = (data[performance_columns], 'Performance_ID')

# Course attributes, keyed by Course_ID
course_columns = ['Course', 'Course_ID', 'Student_ID']
course_entity = (data[course_columns], 'Course_ID')

# Application attributes, keyed by Application_ID
application_columns = ['ApplicationMode', 'ApplicationOrder', 'Application_ID', 'Student_ID']
application_entity = (data[application_columns], 'Application_ID')

**Create Entity Set And Add Dataframe**

In [None]:
# Create an EntitySet
es = ft.EntitySet(id="student_data")

# Add dataframes to the EntitySet.
# Each *_entity is a (dataframe, index column name) tuple, so the index is read from
# the tuple instead of being typed out a second time here; that way the entity
# definition and its registration cannot drift apart.
entities_by_name = {
    "student": student_entity,
    "enrollment": enrollment_entity,
    "student_performance": performance_entity,
    "course": course_entity,
    "application": application_entity,
}

# dict preserves insertion order, so the dataframes are added in the order listed above
for dataframe_name, (entity_frame, index_column) in entities_by_name.items():
    es = es.add_dataframe(
        dataframe_name=dataframe_name,
        dataframe=entity_frame,
        index=index_column,
    )

**Establish Relationship**

In [None]:
# Establish relationships
# `student` is the parent of every other dataframe; each child links back to it
# through its own Student_ID column.
child_dataframe_names = ['student_performance', 'enrollment', 'course', 'application']

# One add_relationship call per child, made in the order listed above; the four
# return values are bound to relationship1..relationship4 in that same order.
relationship1, relationship2, relationship3, relationship4 = [
    es.add_relationship(
        parent_dataframe_name='student',
        parent_column_name='Student_ID',
        child_dataframe_name=child_name,
        child_column_name='Student_ID',
    )
    for child_name in child_dataframe_names
]

# Print the EntitySet
print(es)

Entityset: student_data
  DataFrames:
    student [Rows: 1327, Columns: 15]
    enrollment [Rows: 1327, Columns: 6]
    student_performance [Rows: 1327, Columns: 18]
    course [Rows: 1327, Columns: 3]
    application [Rows: 1327, Columns: 4]
  Relationships:
    student_performance.Student_ID -> student.Student_ID
    enrollment.Student_ID -> student.Student_ID
    course.Student_ID -> student.Student_ID
    application.Student_ID -> student.Student_ID


**Deep Feature Synthesis**

In [None]:
# SET PANDAS DISPLAY OPTIONS TO SHOW ALL COLUMNS AND ROWS
# NOTE: with the row limit lifted, printing a whole feature matrix produces a very
# large output; preview with .head() rather than printing the full frame.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)


# DEEP FEATURE SYNTHESIS
# Run DFS once per target dataframe and keep each result under its own name.
# Previously both runs were assigned to the same pair of variables, so the
# student_performance result was overwritten by the enrollment run.
performance_feature_matrix, performance_feature_defs = ft.dfs(
    entityset=es,
    target_dataframe_name="student_performance",
    verbose=True,
    max_depth=2
)

enrollment_feature_matrix, enrollment_feature_defs = ft.dfs(
    entityset=es,
    target_dataframe_name="enrollment",
    verbose=True,
    max_depth=2
)

# Backward-compatible names: these held the enrollment result (the last run) before,
# so any later cell that reads feature_matrix / feature_defs sees the same objects.
feature_matrix, feature_defs = enrollment_feature_matrix, enrollment_feature_defs



Built 169 features
Elapsed: 00:06 | Progress: 100%|██████████
Built 157 features
Elapsed: 00:05 | Progress: 100%|██████████


In [None]:
# PREVIEW THE GENERATED FEATURE MATRIX
# Printing the entire matrix (all rows and all columns) exceeded the notebook's
# IOPub data rate limit, so nothing was displayed. Report the size and show only
# the first rows instead; the full frame remains available in `feature_matrix`.
print(feature_matrix.shape)
print(feature_matrix.head())

# The feature definitions are a short list (one entry per generated feature)
print(feature_defs)

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)

