In [1]:
# Import dependencies
import matplotlib.pyplot as plt
import pandas as pd
import os
import seaborn as sns
from scipy import stats
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn import metrics
from sklearn.linear_model import ElasticNet, Lasso, Ridge
from sklearn.model_selection import KFold, cross_val_score 
import statsmodels.api as sm

In [3]:
# Read in multiple CSV files and compile them into a single DataFrame.
csv_files = ['data/C200.csv'
            ]

# The first file defines the reference column set (and column order) that
# every file is aligned to.
initial_df = pd.read_csv(csv_files[0])
initial_columns = initial_df.columns

# One cleaned DataFrame per input file
dataframes = []

for file in csv_files:
    # Keep only columns present in the reference header, reorder them to match
    # it (columns a file lacks become all-NaN), then drop rows in which every
    # cell is blank.
    df = (
        pd.read_csv(file, usecols=lambda column: column in initial_columns)
        .reindex(columns=initial_columns)
        .dropna(how='all')
    )
    dataframes.append(df)

# Concatenate all DataFrames in the list into a single DataFrame
C200_df = pd.concat(dataframes, ignore_index=True)

# Save the compiled DataFrame; create the output directory first so a fresh
# checkout without 'cleaned_data/' does not fail on the write.
os.makedirs('cleaned_data', exist_ok=True)
C200_df.to_csv('cleaned_data/C200_cleaned_df.csv', index=False)

## Load the Compiled DataFrame

In [None]:
# Load the compiled DataFrame
C200_df = pd.read_csv('cleaned_data/C200_cleaned_df.csv')

# Derive 'team_number' (first run of digits, converted to numeric) and
# 'section' (first run of upper-case letters) from 'learner_id', then drop the
# original column and any rows that are exact duplicates.
# Raw strings are used for the patterns: '\d' inside a plain string literal is
# an invalid escape sequence and triggers a warning on current Python versions.
# expand=False makes str.extract return a Series for the single capture group.
C200_df = (
    C200_df
    .assign(
        team_number=lambda d: pd.to_numeric(
            d['learner_id'].str.extract(r'(\d+)', expand=False)
        ),
        section=lambda d: d['learner_id'].str.extract(r'([A-Z]+)', expand=False),
    )
    .drop(columns=['learner_id'])
    .drop_duplicates()
)

# Save the updated DataFrame
C200_df.to_csv('data/C200_updated_df.csv', index=False)

# Display the first few rows to check the result
C200_df.head()


##  Basic Descriptive Statistics

In [None]:
# Summary statistics for numerical columns
numerical_summary = C200_df.describe()

# Summary statistics for categorical columns
categorical_summary = C200_df.describe(include=['object', 'category'])

# Display the summaries
print(numerical_summary)
print(categorical_summary)

# DataFrame.info() prints its report itself and returns None, so it is called
# directly; assigning the result and printing it only emitted a stray "None".
C200_df.info()


##  Visualizations

In [None]:
# Apply seaborn's white-grid look to the figures drawn below
sns.set_style("whitegrid")

# Distribution of a numerical column, with a KDE overlay
# (replace 'numerical_column' with your actual column name)
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(C200_df['numerical_column'], kde=True, ax=ax)
ax.set(
    title='Distribution of Numerical Column',
    xlabel='Numerical Column',
    ylabel='Frequency',
)
plt.show()

# Row count per level of a categorical column
# (replace 'categorical_column' with your actual column name)
fig, ax = plt.subplots(figsize=(10, 6))
sns.countplot(data=C200_df, x='categorical_column', ax=ax)
ax.set(
    title='Frequency of Categorical Column',
    xlabel='Categorical Column',
    ylabel='Count',
)
plt.show()

# Spread and outliers of a numerical column
# (replace 'numerical_column' with your actual column name)
fig, ax = plt.subplots(figsize=(10, 6))
sns.boxplot(data=C200_df, y='numerical_column', ax=ax)
ax.set(title='Box Plot of Numerical Column', ylabel='Numerical Column')
plt.show()


In [None]:
# Group by a categorical column and calculate the mean of the numerical columns
# (replace 'categorical_column' with your actual column name).
# numeric_only=True restricts the aggregation to numeric columns; without it,
# pandas >= 2.0 raises a TypeError as soon as the frame contains a non-numeric
# column, because it no longer drops such columns silently.
grouped_summary = C200_df.groupby('categorical_column').mean(numeric_only=True)

print(grouped_summary)


In [None]:
# Load the compiled DataFrame
# NOTE(review): the cells visible in this notebook write
# 'cleaned_data/C200_cleaned_df.csv' and 'data/C200_updated_df.csv', not this
# file — confirm that 'C200_compiled_dataframe.csv' is produced elsewhere and
# that it carries the 'age', 'score' and 'gender' columns used below.
C200_df = pd.read_csv('cleaned_data/C200_compiled_dataframe.csv')

# Basic Descriptive Statistics
numerical_summary = C200_df[['age', 'score']].describe()
categorical_summary = C200_df['gender'].value_counts()

# Display the summaries
print(numerical_summary)
print(categorical_summary)

# DataFrame.info() prints its report itself and returns None, so it is called
# directly; assigning the result and printing it only emitted a stray "None".
C200_df.info()

# Visualizations
# Histogram of 'age' with a KDE overlay
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(C200_df['age'], kde=True, ax=ax)
ax.set(title='Distribution of Age', xlabel='Age', ylabel='Frequency')
plt.show()

# Bar plot of 'gender'
fig, ax = plt.subplots(figsize=(10, 6))
sns.countplot(data=C200_df, x='gender', ax=ax)
ax.set(title='Frequency of Gender', xlabel='Gender', ylabel='Count')
plt.show()

# Box plot of 'score'
fig, ax = plt.subplots(figsize=(10, 6))
sns.boxplot(data=C200_df, y='score', ax=ax)
ax.set(title='Box Plot of Score', ylabel='Score')
plt.show()

# Group by 'gender' and calculate the mean of 'age' and 'score'
grouped_summary = C200_df.groupby('gender')[['age', 'score']].mean()
print(grouped_summary)
