In [1]:
import csv

# Function to read data using the CSV module
def read_csv_file(file_path):
    """Load a CSV file into a list of row dictionaries.

    Each row is keyed by the column names taken from the file's header line.

    Args:
        file_path: Path of the CSV file to read (opened as UTF-8 text).

    Returns:
        A list with one dict per data row. An empty list is returned when
        the file is missing or cannot be read; the problem is printed
        instead of being raised.
    """
    rows = []
    try:
        with open(file_path, mode='r', newline='', encoding='utf-8') as handle:
            # DictReader consumes the header itself; list() drains the rest.
            rows = list(csv.DictReader(handle))
    except FileNotFoundError:
        print(f"Error: File '{file_path}' not found.")
    except IOError as error:
        print(f"Failed to read CSV file: {error}")
    return rows

# Example: load the student records with the csv-module reader.
# read_csv_file returns a list of row dicts (empty if the file could not be read).
file_path = "students_data.csv"
csv_data = read_csv_file(file_path)
print(csv_data[:5])  # Preview only the first 5 records to keep the output short


[{'ID': 'ID-371', 'Sex': 'Female', 'Age': '17', 'Address': 'Urban', 'Family_size': 'greater than 3', 'Parent_cohabitation_status': 'Together', 'Mother_education_level ': 'primary (5th-9th grade)', 'Father_education_level': 'secondary', 'Mother_job': 'at_home', 'Father_job': 'at_home', 'Guardian': 'mother', 'Traveltime': '3', 'Studytime': '2', 'Failures': '1', 'School_support': 'no', 'Family_support': 'no', 'Parent_aid': 'no', 'Activities': 'yes', 'Nursery_attend': 'yes', 'Access_internet': 'no', 'In_Romantic_relationship': 'yes', 'Family_relative': '2', 'Freetime': '2', 'Goout': '4', 'Health': 'good', 'Absences': '89', 'State': 'Florida', 'Race': 'Asian', 'Math_score': '21', 'Reading_score': '33', 'Writing_score': '36', 'Attendance_rate': '72.27', 'Suspensions': '0', 'Expulsions': '0', 'Teacher_support': 'yes', 'Counseling': 'no', 'Social_worker_visits': '2', 'Parental_involvement': 'low'}, {'ID': 'ID-1352', 'Sex': 'Female', 'Age': '18', 'Address': 'Rural', 'Family_size': 'Less or equa

In [2]:
# Function to read data using the Pandas module
import pandas as pd

def read_pandas_file(file_path):
    """Load a CSV file into a pandas DataFrame.

    Read failures are printed and an empty DataFrame is returned, in the
    same spirit as ``read_csv_file``, which reports a missing file instead
    of raising.

    Args:
        file_path: Path of the CSV file to read.

    Returns:
        The parsed DataFrame, or an empty DataFrame when the file is
        missing, unreadable, empty, or cannot be parsed.
    """
    try:
        return pd.read_csv(file_path)
    except FileNotFoundError:
        # Must precede OSError: FileNotFoundError is one of its subclasses.
        print(f"Error: File '{file_path}' not found.")
    except pd.errors.EmptyDataError:
        print("The CSV file is empty!")
    except pd.errors.ParserError as error:
        print(f"Parsing error while reading the CSV file: {error}")
    except OSError as error:
        # Covers permission problems, directories passed as files, etc.
        print(f"Failed to read CSV file: {error}")
    return pd.DataFrame()

In [3]:
# Load the same CSV into a DataFrame (empty if the file is empty or unparsable)
# and preview its first rows.
dataset = read_pandas_file("students_data.csv")
print(dataset.head())


        ID     Sex  Age Address         Family_size  \
0   ID-371  Female   17   Urban      greater than 3   
1  ID-1352  Female   18   Rural  Less or equal to 3   
2  ID-1924  Female   19   Rural  Less or equal to 3   
3  ID-3722  Female   17   Rural      greater than 3   
4  ID-3463  Female   15   Urban      greater than 3   

  Parent_cohabitation_status  Mother_education_level  Father_education_level  \
0                   Together  primary (5th-9th grade)              secondary   
1                   Together  primary (5th-9th grade)       higher education   
2                      Apart         higher education                   none   
3                   Together  primary (5th-9th grade)                   none   
4                   Together         higher education    primary (4th grade)   

  Mother_job Father_job  ... Math_score  Reading_score  Writing_score  \
0    at_home    at_home  ...         21             33             36   
1    teacher     health  ...         20   

## CSV module Task A1 To A4

In [4]:
# Function to get data by student ID
def get_data_by_student_id(data, student_id):
    """Return every record whose 'ID' field equals ``student_id``.

    Args:
        data: Iterable of dict-like records (each must support ``.get``).
        student_id: ID value to match exactly.

    Returns:
        A list of the matching records in their original order; empty when
        nothing matches. Records without an 'ID' key are skipped.
    """
    matches = []
    for record in data:
        if record.get('ID') == student_id:
            matches.append(record)
    return matches

# Function to get data by race
def get_data_by_race(data, race):
    """Return every record whose 'Race' field equals ``race``.

    Args:
        data: Iterable of dict-like records (each must support ``.get``).
        race: Race value to match exactly.

    Returns:
        A list of the matching records in their original order; empty when
        nothing matches. Records without a 'Race' key are skipped.
    """
    return list(filter(lambda record: record.get('Race') == race, data))

# Function to get data by parental involvement level
def get_data_by_parental_involvement(data, involvement_level):
    """Return every record whose 'Parental_involvement' equals ``involvement_level``.

    Args:
        data: Iterable of dict-like records (each must support ``.get``).
        involvement_level: Involvement value to match exactly.

    Returns:
        A list of the matching records in their original order; empty when
        nothing matches. Records without the key are skipped.
    """
    selected = []
    for record in data:
        level = record.get('Parental_involvement')
        if level == involvement_level:
            selected.append(record)
    return selected

# Function to retrieve high-performing students
def get_high_performers(data):
    """Return students with high attendance and above-average scores.

    A student qualifies when the attendance rate is at least 90% and the
    Math, Reading and Writing scores are each strictly above the respective
    average over all rows of ``data``.

    Args:
        data: Either a list of row dicts (the shape ``read_csv_file``
            returns, where every value is a string) or a pandas DataFrame.
            The input is never modified.

    Returns:
        A list of dicts, one per qualifying student, with the keys 'ID',
        'Sex', 'Age', 'Race', 'Math_score', 'Reading_score',
        'Writing_score' and 'Attendance_rate'; the three scores and the
        attendance rate are floats. Empty list when ``data`` has no rows or
        nobody qualifies (a diagnostic breakdown is printed in that case).

    Raises:
        KeyError: If a required column is missing from non-empty data.
        ValueError: If a score or attendance value cannot be parsed as float.
    """
    numeric_columns = ['Math_score', 'Writing_score', 'Reading_score', 'Attendance_rate']

    # The other helpers in this section operate on the list of dicts produced
    # by read_csv_file, so accept that shape as well as a DataFrame.
    frame = data if isinstance(data, pd.DataFrame) else pd.DataFrame(data)
    if frame.empty:
        print("No student records to evaluate.")
        return []

    # assign() builds a new frame, so the caller's data keeps its own dtypes.
    frame = frame.assign(**{col: frame[col].astype(float) for col in numeric_columns})

    # Calculate average scores
    avg_math = frame['Math_score'].mean()
    avg_writing = frame['Writing_score'].mean()
    avg_reading = frame['Reading_score'].mean()

    # Set high attendance threshold
    attendance_threshold = 90.0

    # One boolean mask per criterion; reused for the diagnostics below.
    high_attendance = frame['Attendance_rate'] >= attendance_threshold
    above_math = frame['Math_score'] > avg_math
    above_reading = frame['Reading_score'] > avg_reading
    above_writing = frame['Writing_score'] > avg_writing

    high_performers = frame[high_attendance & above_reading & above_writing & above_math]

    if high_performers.empty:
        print(f"No students found with attendance >= {attendance_threshold}% and above-average scores in all subjects.")
        print(f"Average scores - Math: {avg_math:.2f}, Reading: {avg_reading:.2f}, Writing: {avg_writing:.2f}")
        print(f"Number of students with attendance >= {attendance_threshold}%: {high_attendance.sum()}")
        print(f"Number of students above average in Math: {above_math.sum()}")
        print(f"Number of students above average in Reading: {above_reading.sum()}")
        print(f"Number of students above average in Writing: {above_writing.sum()}")

    return high_performers[['ID', 'Sex', 'Age', 'Race', 'Math_score', 'Reading_score', 'Writing_score', 'Attendance_rate']].to_dict(orient='records')


## Pandas module Task B1 To B4

In [5]:
import pandas as pd
import matplotlib.pyplot as plt  

#  B1: Top 3 Mother’s Education Levels by Race
def top_mothers_education_by_race(data, race):
    """Return the three most common mother's education levels for one race.

    Args:
        data: DataFrame with a 'Race' column and a mother's-education column.
        race: Race value to filter on (exact match).

    Returns:
        Dict mapping education level -> number of students, limited to the
        three most frequent levels. Empty dict when a required column is
        missing (an error is printed) or when no row matches ``race``.
    """
    # The CSV header carries a trailing space ('Mother_education_level '), so
    # match on the stripped name instead of hard-coding either spelling.
    education_col = next(
        (col for col in data.columns if str(col).strip() == 'Mother_education_level'),
        None,
    )
    if 'Race' not in data.columns or education_col is None:
        print("Error: Required columns not found in dataset.")
        return {}

    filtered_data = data[data['Race'] == race]
    top_education = filtered_data[education_col].value_counts().head(3)
    return top_education.to_dict()

#  B2: Average Absences by Parental Involvement
def avg_absences_by_parental_involvement(data, parental_involvement):
    """Return the mean number of absences for one parental-involvement level.

    Args:
        data: DataFrame with 'Parental_involvement' and 'Absences' columns.
        parental_involvement: Involvement level to filter on (exact match).

    Returns:
        The mean of 'Absences' (cast to float) over the matching rows; NaN
        when no row matches. None when a required column is missing (an
        error is printed).
    """
    for column in ('Parental_involvement', 'Absences'):
        if column not in data.columns:
            print("Error: Required columns not found.")
            return None

    level_mask = data['Parental_involvement'] == parental_involvement
    absences = data.loc[level_mask, 'Absences'].astype(float)
    return absences.mean()

#  B3: Average Math Score by Race (students with >80% attendance)
def avg_maths_score_by_attendance(data, race):
    """Return the mean math score of one race's students with >80% attendance.

    Args:
        data: DataFrame with 'Race', 'Attendance_rate' and 'Math_score'
            columns. It is not modified.
        race: Race value to filter on (exact match).

    Returns:
        The mean of 'Math_score' (cast to float) over rows of the given race
        whose attendance rate is strictly above 80; NaN when no row
        qualifies. None when a required column is missing (an error is
        printed).
    """
    required = ('Race', 'Attendance_rate', 'Math_score')
    if any(column not in data.columns for column in required):
        print("Error: Required columns not found.")
        return None

    # Coerce into a local Series: writing the converted values back into
    # `data` would silently change the caller's DataFrame. Unparsable
    # attendance values become NaN and therefore never pass the > 80 test.
    attendance = pd.to_numeric(data["Attendance_rate"], errors="coerce")
    selected = (data['Race'] == race) & (attendance > 80)
    return data.loc[selected, 'Math_score'].astype(float).mean()

#  B4: Impact of Extracurricular Activities on Performance
def analyze_extracurricular_activities(data):
    """Summarise scores, absences and attendance per 'Activities' group.

    Groups the rows by the 'Activities' column, averages the three score
    columns plus 'Absences' and 'Attendance_rate', rounds to two decimals,
    prints the resulting table and returns it.

    Args:
        data: DataFrame with 'Activities', 'Math_score', 'Reading_score',
            'Writing_score', 'Absences' and 'Attendance_rate' columns.

    Returns:
        Dict keyed by each 'Activities' value, mapping to a dict of the
        renamed averages ('Avg Math Score', ...). Empty dict when a column
        is missing or any other error occurs (the error is printed).
    """
    # Source column -> label used in the summary; insertion order fixes the
    # column order of the table.
    labels = {
        'Math_score': 'Avg Math Score',
        'Reading_score': 'Avg Reading Score',
        'Writing_score': 'Avg Writing Score',
        'Absences': 'Avg Absences',
        'Attendance_rate': 'Avg Attendance Rate',
    }
    try:
        grouped = data.groupby('Activities')[list(labels)]
        averages = grouped.mean().round(2)
        summary = averages.rename(columns=labels)
        print("\nImpact of Extracurricular Activities on Academic Performance:")
        print(summary)
        return summary.to_dict(orient='index')
    except KeyError as e:
        print(f"Missing expected column: {e}")
        return {}
    except Exception as e:
        print(f"Unexpected error: {e}")
        return {}



## Visualization module Task C1 To C4

In [6]:
!pip install matplotlib




In [7]:
#  C1: Race Proportion Pie Chart (Handles Different Column Names)
def race_proportion_Graph(data):
    """Show a pie chart of how the students are split across race categories.

    The race column is looked up under several candidate names
    ("race/ethnicity", "Race", "Ethnicity"); the first one present is used.

    Args:
        data: DataFrame containing one of the candidate race columns.

    Returns:
        None. Prints an error and returns early when no candidate column
        exists; otherwise displays the figure.
    """
    race_col = None
    for candidate in ("race/ethnicity", "Race", "Ethnicity"):
        if candidate in data.columns:
            race_col = candidate
            break

    if race_col is None:
        print("Error: No valid race column found in dataset.")
        return

    slice_colors = ['blue', 'green', 'red', 'purple', 'orange']
    race_counts = data[race_col].value_counts()

    plt.figure(figsize=(8, 6))
    race_counts.plot(kind='pie', autopct='%1.1f%%', colors=slice_colors)
    plt.title("Student Race Proportion")
    plt.ylabel("")  # blank out the y-label for a cleaner pie
    plt.show()

#  C2: Average Writing Scores by Race (Bar Chart)
def avg_writing_scores_Graph(data):
    """Show a bar chart of the mean writing score for each race.

    Args:
        data: DataFrame with 'Race' and 'Writing_score' columns.

    Returns:
        None. Prints an error and returns early when a required column is
        missing; otherwise displays the figure.
    """
    required = {'Race', 'Writing_score'}
    if not required.issubset(data.columns):
        print("Error: Required columns not found.")
        return

    mean_by_race = data.groupby('Race')['Writing_score'].mean()
    mean_by_race.plot(kind='bar', color='orange')

    plt.title('Average Writing Scores by Race')
    plt.xlabel('Race')
    plt.ylabel('Average Writing Score')
    plt.show()

#  C3: Writing Score vs Reading Score (Scatter Plot)
def writing_vs_reading_Graph(data):
    """Show a scatter plot of writing score (y) against reading score (x).

    Args:
        data: DataFrame with 'Reading_score' and 'Writing_score' columns.

    Returns:
        None. Prints an error and returns early when a required column is
        missing; otherwise displays the figure.
    """
    for column in ('Reading_score', 'Writing_score'):
        if column not in data.columns:
            print("Error: Required columns not found.")
            return

    reading = data['Reading_score']
    writing = data['Writing_score']

    # Semi-transparent markers keep overlapping points distinguishable.
    plt.scatter(reading, writing, alpha=0.5)
    plt.title('Writing Score vs. Reading Score')
    plt.xlabel('Reading Score')
    plt.ylabel('Writing Score')
    plt.show()

#  C4: Average Absences by Free Time (Horizontal Bar Chart)
import matplotlib.pyplot as plt


def plot_absences_by_freetime(data):
    """Show a horizontal bar chart of mean absences per free-time level.

    Args:
        data: DataFrame with 'Freetime' and 'Absences' columns.

    Returns:
        None. Prints an error and returns early when a required column is
        missing; otherwise displays the figure.
    """
    required = {'Freetime', 'Absences'}
    if not required.issubset(data.columns):
        print("Error: Required columns not found.")
        return

    plt.figure(figsize=(10, 6))

    # One bar per distinct free-time level, in ascending order.
    freetime_levels = sorted(data['Freetime'].unique())
    avg_absences = []
    for level in freetime_levels:
        rows_at_level = data[data['Freetime'] == level]
        avg_absences.append(rows_at_level['Absences'].mean())

    plt.barh(freetime_levels, avg_absences, color='skyblue', edgecolor='black')

    # Labels and a light vertical grid to make bar lengths easy to read.
    plt.title('Average Absences by Free Time Level', fontsize=14)
    plt.xlabel('Average Number of Absences', fontsize=12)
    plt.ylabel('Free Time Level', fontsize=12)
    plt.grid(axis='x', linestyle='--', alpha=0.7)

    plt.show()






