# Feature Engineering

# Overview
This file contains code cells that extract data from students' cogs18 final projects. All data were collected with consents, anonymized, and processed ethically.

# Setup

In [1]:
# general imports

import numpy as np                  
import pandas as pd                 
import os
import shutil
import re

In [2]:
# initialize the desired dataframe

column_names = ["project_name", 
                "lines_code", 
                "num_imports", 
                "imports", 
                "lines_comments", 
                "num_comments_words",
                "num_functions", 
                "num_classes",
                "num_tests",
                "num_asserts",
                "num_logic",
                "num_loops"]

df = pd.DataFrame(columns=column_names)

# Conversion from .py to .txt
These code cells take the scripts part of student's final project (.py) and convert them to plain text files (.txt) so they can be easily handled later.

In [3]:
# functions used for conversions

def create_txt_copy(py_path):
    file_name_idx = py_path.rfind("/")
    file_directory = py_path[:file_name_idx+1]
    file_name_py = py_path[file_name_idx+1:]
    file_name_txt = file_name_py[:-3] + ".txt"
    
    # copying
    original = py_path
    target = file_directory + file_name_txt
    shutil.copyfile(original, target)
    return

In [4]:
# iterate through the project folders to categorize paths and organize intermediate data structures

dict_for_df = {}
dict_for_paths = {}
list_project_names = []

file_directory = "/Users/haoyangguo/Documents/GitHub/cogs18_projects/00_feature_engineering/data/cogs18_projects"
project_folder_list = os.listdir(file_directory)

for project_name in project_folder_list:
    if (project_name == ".DS_Store"):
        continue
        
    list_project_names.append(project_name)
    
    dict_for_df[project_name] = {"project_name":project_name, 
                "lines_code":0, 
                "num_imports":0, 
                "imports":[], 
                "lines_comments":0, 
                "num_comments_words":0,
                "num_functions":0, 
                "num_classes":0,
                "num_tests":0,
                "num_asserts":0,
                "num_logic":0,
                "num_loops":0,
                }
                                      
    dict_for_paths[project_name] = {".txt":[],
                                    ".ipynb":[],
                                    }
    
    for project_content_layer1 in os.listdir(file_directory + '/' + project_name):
        path_layer1 = file_directory + '/' + project_name + '/' + project_content_layer1
        
        if os.path.isfile(path_layer1):
            
            if path_layer1.endswith(".py"):
                ((dict_for_paths[project_name])[".txt"]).append(path_layer1[:-3] + ".txt")
                create_txt_copy(path_layer1)
                
            elif path_layer1.endswith(".ipynb"):
                ((dict_for_paths[project_name])[".ipynb"]).append(path_layer1)
                
        elif os.path.isdir(path_layer1):
            for project_content_layer2 in os.listdir(path_layer1):
                path_layer2 = path_layer1 + "/" + project_content_layer2
                
                if os.path.isfile(path_layer2):
                    
                    if path_layer2.endswith(".py"):
                        ((dict_for_paths[project_name])[".txt"]).append(path_layer2[:-3] + ".txt")
                        create_txt_copy(path_layer2)
                        
                    elif path_layer1.endswith(".ipynb"):
                        ((dict_for_paths[project_name])[".ipynb"]).append(path_layer2)
                
                elif os.path.isdir(path_layer2):
                    for project_content_layer3 in os.listdir(path_layer2):
                        path_layer3 = path_layer2 + "/" + project_content_layer3
                        
                        if os.path.isfile(path_layer3):
                    
                            if path_layer3.endswith(".py"):
                                ((dict_for_paths[project_name])[".txt"]).append(path_layer3[:-3] + ".txt")
                                create_txt_copy(path_layer3)
                        
                            elif path_layer3.endswith(".ipynb"):
                                ((dict_for_paths[project_name])[".ipynb"]).append(path_layer3)
                        
                        elif os.path.isdir(path_layer3):
                            for project_content_layer4 in os.listdir(path_layer3):
                                path_layer4 = path_layer3 + "/" + project_content_layer4
                        
                                if os.path.isfile(path_layer4):
                    
                                    if path_layer4.endswith(".py"):
                                        ((dict_for_paths[project_name])[".txt"]).append(path_layer4[:-3] + ".txt")
                                        create_txt_copy(path_layer4)
                        
                                    elif path_layer4.endswith(".ipynb"):
                                        ((dict_for_paths[project_name])[".ipynb"]).append(path_layer4)
                        
                        

# Extracting data from the .txt files
These following code cells read through all the .txt files in order to extract desired data

In [5]:
# functions to extract data

def removing_leading_whitespaces(text):
     return re.sub(r"^\s+","",text)

def count_words(line):
    line_list = line.split()
    return len(line_list)

def get_import(line):
    line_list = line.split()
    return line_list[1]

def extract_lines(project_name, lines):
    lines_code = 0
    num_imports = 0
    imports = []
    lines_comments = 0
    num_comments_words = 0
    num_functions = 0
    num_classes = 0
    num_logic = 0
    num_loops = 0
    
    in_comment = False
    num_overall_lines = 0
    
    for line in lines:
        if (len(line.strip()) == 0):
            continue
        else:
            num_overall_lines+=1
            line_clean = removing_leading_whitespaces(line)
            split_line = line_clean.split()
            
            if ("#" == line_clean[0]):
                lines_comments+=1
                num_comments_words += count_words(line_clean)
                
            elif ("'''" in line_clean or '"""' in line_clean):
                pattern1 = '"""'
                pattern2 = "'''"
                
                if len(re.findall(pattern1, line_clean)) == 2 or len(re.findall(pattern2, line_clean)) == 2:
                    lines_comments+=1
                    num_comments_words += count_words(line_clean)
                
                elif (in_comment == False):
                    lines_comments+=1
                    num_comments_words += count_words(line_clean)
                    in_comment = True
                elif (in_comment == True):
                    lines_comments+=1
                    num_comments_words += count_words(line_clean)
                    in_comment = False
                
                    
            elif (in_comment == True):
                lines_comments+=1
                num_comments_words += count_words(line_clean)
                
            
            elif ("import" == split_line[0] or "from" in split_line[0]):
                num_imports+=1
                imports.append(get_import(line_clean))
                
            
            elif ("def" == split_line[0]):
                num_functions+=1
                
                
            elif ('class' == split_line[0]):
                num_classes+=1
            
            elif (split_line[0] == "if"):
                num_logic+=1
                
            elif (split_line[0] == "for") and ((split_line[-1])[-1] == ":"):
                num_loops+=1
                
                
        
    lines_code = num_overall_lines - lines_comments
    
    # direct into the dict
    variable_list = [lines_code, num_imports, imports, lines_comments, 
                     num_comments_words, num_functions, num_classes, num_logic, num_loops]
    key_list = ["lines_code", "num_imports", "imports", "lines_comments", 
                     "num_comments_words", "num_functions", "num_classes", "num_logic", "num_loops"]
    for idx in range(len(variable_list)):
        (dict_for_df[project_name])[key_list[idx]] += variable_list[idx]
    
    return

def test_extract_lines(project_name, lines):
    num_tests = 0
    num_asserts = 0
    for line in lines:
        line_clean = removing_leading_whitespaces(line)
        split_line = line_clean.split()
        
        if (len(line.strip()) == 0):
            continue
        else:
            if (split_line[0] == 'def'):
                num_tests+=1
            elif (split_line[0] == 'assert'):
                num_asserts+=1
    
    (dict_for_df[project_name])['num_tests'] += num_tests
    (dict_for_df[project_name])['num_asserts'] += num_asserts
    
    return

def read_txt(project_name):
    for path in (dict_for_paths[project_name])[".txt"]:

            if (not("test" in path)):
                with open(path) as txt_file:
                    lines = txt_file.readlines()
                    extract_lines(project_name, lines)
            else:
                with open(path) as test_txt_file:
                    lines = test_txt_file.readlines()
                    test_extract_lines(project_name, lines)
    return
    
    
    

In [6]:
# complete storing data into the dict

for project_name in list_project_names:
    read_txt(project_name)

# Filling in data into the dataframe
These following code cells iterate through the intermediate data structures and direct desired data into the existing dataframe

In [7]:
# append dictionaries to the empty dataframe

for key in dict_for_df:
    df = df.append(dict_for_df[key], ignore_index=True)

In [8]:
df.dtypes

project_name          object
lines_code            object
num_imports           object
imports               object
lines_comments        object
num_comments_words    object
num_functions         object
num_classes           object
num_tests             object
num_asserts           object
num_logic             object
num_loops             object
dtype: object

In [9]:
# change data types

df[["lines_code", "num_imports", "lines_comments", "num_comments_words","num_functions", "num_classes","num_tests","num_asserts","num_logic","num_loops"]] = df[["lines_code", "num_imports", "lines_comments", "num_comments_words","num_functions", "num_classes","num_tests","num_asserts","num_logic","num_loops"]].astype(int) 

In [10]:
# peek the dataframe

df.head(10)

Unnamed: 0,project_name,lines_code,num_imports,imports,lines_comments,num_comments_words,num_functions,num_classes,num_tests,num_asserts,num_logic,num_loops
0,SP21_B00_006,8,2,"[sys, my_module.functions]",9,55,2,0,2,2,0,0
1,SP21_B00_001,22,3,"[numpy, pandas, matplotlib.pyplot]",44,218,3,0,3,7,1,0
2,SP21_B00_008,60,8,"[numpy, pandas, sklearn.linear_model, matplotl...",72,472,5,1,3,11,1,5
3,WI21_A00_033,5,0,[],3,8,0,0,4,15,0,0
4,WI21_A00_034,61,0,[],7,66,7,0,7,14,4,2
5,WI21_A00_002,73,5,"[numpy, pandas, sklearn.linear_model, matplotl...",84,426,6,0,0,0,0,0
6,WI21_A00_005,109,2,"[random, string]",43,337,9,1,1,20,6,4
7,SP21_B00_009,114,4,"[random, random, functions, sys]",67,418,5,1,4,15,7,3
8,SP21_B00_007,50,0,[],66,382,5,1,4,17,1,3
9,WI21_A00_004,69,4,"[numpy, matplotlib.pyplot, pandas, seaborn]",57,382,4,0,4,6,4,1


In [11]:
# create a new categorical feature (quarter)

def label_quarter(row):
    project_quarter = (row["project_name"])[:4]
    return project_quarter

df["project_quarter"] = df.apply (lambda row: label_quarter(row), axis=1)

In [12]:
# double check

df.head()

Unnamed: 0,project_name,lines_code,num_imports,imports,lines_comments,num_comments_words,num_functions,num_classes,num_tests,num_asserts,num_logic,num_loops,project_quarter
0,SP21_B00_006,8,2,"[sys, my_module.functions]",9,55,2,0,2,2,0,0,SP21
1,SP21_B00_001,22,3,"[numpy, pandas, matplotlib.pyplot]",44,218,3,0,3,7,1,0,SP21
2,SP21_B00_008,60,8,"[numpy, pandas, sklearn.linear_model, matplotl...",72,472,5,1,3,11,1,5,SP21
3,WI21_A00_033,5,0,[],3,8,0,0,4,15,0,0,WI21
4,WI21_A00_034,61,0,[],7,66,7,0,7,14,4,2,WI21


In [13]:
# general overview of the dataframe

df.describe()

Unnamed: 0,lines_code,num_imports,lines_comments,num_comments_words,num_functions,num_classes,num_tests,num_asserts,num_logic,num_loops
count,186.0,186.0,186.0,186.0,186.0,186.0,186.0,186.0,186.0,186.0
mean,98.091398,3.392473,62.827957,372.172043,7.129032,0.623656,4.080645,9.951613,6.822581,2.623656
std,86.896006,2.872375,58.790903,342.152238,4.915999,1.516853,2.843476,9.056747,8.939782,4.994947
min,5.0,0.0,1.0,8.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,46.0,2.0,23.0,180.5,4.0,0.0,3.0,3.0,2.0,0.0
50%,79.0,3.0,48.5,296.0,6.0,0.0,3.0,8.0,4.0,1.0
75%,123.5,4.0,82.0,469.75,9.0,1.0,5.0,12.75,9.0,3.0
max,601.0,20.0,397.0,2614.0,33.0,14.0,19.0,57.0,82.0,47.0


# Combining dataframes
The newly generated dataframe will be combined with the existing dataframe with data collected from cogs18 class surveys.

In [14]:
# read in the csv file

df_survey = pd.read_csv("/Users/haoyangguo/Documents/GitHub/cogs18_projects/00_feature_engineering/csv/grades_survey_wi21_sp21.csv")

In [15]:
# peek the dataframe

df_survey.head()

Unnamed: 0,anonymized,total_points,programming_background,pre_skills,Concept (Total),File Structure (Total),Project Description (Total),Approach (Total),Code (Total),Style (Total),Documentation (Total),Tests (Total),Fudge Points,Extra Credit,Total Score,instructor,quarter
0,WI21_B00_001,98.5,I've never programmed before!,None of these,5.0,4.5,10.0,20.0,28.0,10.0,9.0,8.0,,,95.0,Ellis,WI21
1,WI21_B00_002,99.53,I've never programmed before!,None of these,5.0,4.5,10.0,20.0,27.0,10.0,8.0,10.0,,,94.5,Ellis,WI21
2,WI21_B00_003,92.97,I've never programmed before!,None of these,5.0,4.5,10.0,20.0,30.0,10.0,10.0,7.5,,,100.0,Ellis,WI21
3,WI21_B00_004,94.69,Matlab,"Read basic Python programs, recognizing the st...",5.0,5.0,0.0,18.0,30.0,10.0,10.0,10.0,,,88.5,Ellis,WI21
4,WI21_B00_005,98.72,I've never programmed before!,None of these,5.0,4.5,10.0,20.0,27.0,10.0,8.0,10.0,,,98.5,Ellis,WI21


In [16]:
# prepare the dataframe for joining

df_survey = df_survey.rename(columns = {'anonymized':'project_name'})

In [17]:
df_survey.head()

Unnamed: 0,project_name,total_points,programming_background,pre_skills,Concept (Total),File Structure (Total),Project Description (Total),Approach (Total),Code (Total),Style (Total),Documentation (Total),Tests (Total),Fudge Points,Extra Credit,Total Score,instructor,quarter
0,WI21_B00_001,98.5,I've never programmed before!,None of these,5.0,4.5,10.0,20.0,28.0,10.0,9.0,8.0,,,95.0,Ellis,WI21
1,WI21_B00_002,99.53,I've never programmed before!,None of these,5.0,4.5,10.0,20.0,27.0,10.0,8.0,10.0,,,94.5,Ellis,WI21
2,WI21_B00_003,92.97,I've never programmed before!,None of these,5.0,4.5,10.0,20.0,30.0,10.0,10.0,7.5,,,100.0,Ellis,WI21
3,WI21_B00_004,94.69,Matlab,"Read basic Python programs, recognizing the st...",5.0,5.0,0.0,18.0,30.0,10.0,10.0,10.0,,,88.5,Ellis,WI21
4,WI21_B00_005,98.72,I've never programmed before!,None of these,5.0,4.5,10.0,20.0,27.0,10.0,8.0,10.0,,,98.5,Ellis,WI21


In [18]:
df_survey.shape

(142, 17)

In [19]:
# left join the dataframes

df_final = pd.merge(df_survey, df, on="project_name", how="left")

In [20]:
# check the result

df_final.head(10)

Unnamed: 0,project_name,total_points,programming_background,pre_skills,Concept (Total),File Structure (Total),Project Description (Total),Approach (Total),Code (Total),Style (Total),...,imports,lines_comments,num_comments_words,num_functions,num_classes,num_tests,num_asserts,num_logic,num_loops,project_quarter
0,WI21_B00_001,98.5,I've never programmed before!,None of these,5.0,4.5,10.0,20.0,28.0,10.0,...,[],41,252,4,0,2,5,5,1,WI21
1,WI21_B00_002,99.53,I've never programmed before!,None of these,5.0,4.5,10.0,20.0,27.0,10.0,...,"[sys, pandas, numpy, my_module.functions]",52,299,3,0,3,18,9,3,WI21
2,WI21_B00_003,92.97,I've never programmed before!,None of these,5.0,4.5,10.0,20.0,30.0,10.0,...,"[datetime, numpy, pandas, matplotlib.pyplot, p...",57,302,4,0,4,7,2,2,WI21
3,WI21_B00_004,94.69,Matlab,"Read basic Python programs, recognizing the st...",5.0,5.0,0.0,18.0,30.0,10.0,...,[string],71,298,8,0,3,9,12,5,WI21
4,WI21_B00_005,98.72,I've never programmed before!,None of these,5.0,4.5,10.0,20.0,27.0,10.0,...,"[sys, my_module.functions]",18,167,4,1,3,12,1,0,WI21
5,WI21_B00_006,89.47,I've never programmed before!,None of these,5.0,4.5,10.0,17.0,30.0,9.0,...,[pandas],76,465,7,1,3,14,6,0,WI21
6,WI21_B00_007,95.71,I've never programmed before!,None of these,5.0,4.5,10.0,15.0,30.0,9.0,...,"[sys, sys, my_module.functions]",48,239,4,0,4,12,10,0,WI21
7,WI21_B00_008,85.09,I've never programmed before!,None of these,5.0,4.5,10.0,16.0,22.0,9.0,...,"[string, random, random, random, random, rando...",42,188,12,0,11,31,10,7,WI21
8,WI21_B00_009,99.39,I've never programmed before!,None of these,5.0,5.0,10.0,20.0,30.0,9.0,...,[string],133,785,10,0,5,21,10,1,WI21
9,WI21_B00_010,91.83,Javascript,None of these,,,,,,,...,"[sys, my_module.functions]",9,55,2,0,2,2,0,0,WI21


In [21]:
df_final.shape

(142, 29)

# Exporting
Since all desired data is extracted from students' projects and properly combined with the existing data, the resulting dataframe will be exported to an csv file for further cleaning and wrangling.

In [22]:
# export engineered dataframe into a csv file ready to be cleaned and wrangled

df_final.to_csv(path_or_buf="/Users/haoyangguo/Documents/GitHub/cogs18_projects/01_data_analysis/csv/cogs18_projects.csv",index=False)