<a href="https://colab.research.google.com/github/Jessvcv/AI-Projects/blob/main/DataCleaning%26Organizing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import csv

# Step 1: Read data and loop through the contents line by line
def read_data(file_path):
    """Read a tab-delimited file and return its complete data rows.

    The first line is treated as a header and discarded. Any later row with
    fewer than 10 fields is silently skipped.

    Args:
        file_path: Path of the tab-separated file to read.

    Returns:
        A list of rows, each a list of raw string fields (10 or more).
        An empty file yields an empty list.
    """
    # newline='' is what the csv module expects so it can handle line
    # endings (including ones embedded in quoted fields) itself.
    with open(file_path, 'r', newline='') as file:
        reader = csv.reader(file, delimiter='\t')
        # Skip header row; the default keeps an empty file from raising
        # StopIteration.
        next(reader, None)
        return [row for row in reader if len(row) >= 10]

# Step 2: Takes a list of elements and converts it to appropriate types
def convert_row_type(row):
    """Convert one raw row of strings into a typed record.

    Layout of the returned list:
        [0]    student name, surrounding whitespace stripped
        [1-4]  SAT, GPA, interest, high school quality as floats
        [5-8]  four grades as floats
        [9]    1 if the tenth field (stripped) is exactly 'in', otherwise 0

    Args:
        row: Sequence of at least 10 string fields.

    Returns:
        A new 10-element list in the layout above.
    """
    student_name = row[0].strip()
    # SAT, GPA, interest and high school quality sit at indices 1..4.
    core_metrics = [float(row[position]) for position in range(1, 5)]
    semester_grades = [float(grade) for grade in row[5:9]]
    in_state_flag = int(row[9].strip() == 'in')
    return [student_name, *core_metrics, *semester_grades, in_state_flag]

# Step 3: Normalize data and compute student score
def compute_student_score(data_row):
    """Compute a student's weighted score, rounded to two decimals.

    SAT (index 1) is divided by 160 and GPA (index 2) is multiplied by 2
    before weighting (presumably to bring both onto a similar scale --
    confirm against the dataset). Weights: GPA 0.4, SAT 0.3, high school
    quality (index 4) 0.2, interest (index 3) 0.05, in-state flag
    (index 9) 0.05.

    Args:
        data_row: Typed record with numeric values at indices 1-4 and 9.

    Returns:
        The weighted sum rounded to 2 decimal places.
    """
    sat_component = (data_row[1] / 160) * 0.3
    gpa_component = (data_row[2] * 2) * 0.4
    interest_component = data_row[3] * 0.05
    quality_component = data_row[4] * 0.2
    residency_component = data_row[9] * 0.05

    # Accumulate in a fixed order (GPA, SAT, quality, interest, residency)
    # so floating-point results stay reproducible.
    total = gpa_component + sat_component
    total += quality_component
    total += interest_component
    total += residency_component
    return round(total, 2)

# Split data
def split_student_data(student_data):
    """Split a student record into its academic part and its grades part.

    Args:
        student_data: Sequence holding one student's record.

    Returns:
        A 2-tuple ``(academic_data, grades_data)`` where ``academic_data``
        is the elements at indices 0-4 and ``grades_data`` is the elements
        at indices 5-8.
    """
    academic_span = slice(0, 5)
    grades_span = slice(5, 9)
    return student_data[academic_span], student_data[grades_span]

# Step 4: Find outliers
def is_outlier(row, score):
    """Decide whether a student record counts as an outlier.

    A record is an outlier when its interest value (index 3) is 0, or when
    GPA * 2 (index 2) exceeds SAT / 160 (index 1) by more than 2 while the
    score is at least 5.

    Args:
        row: Typed student record.
        score: The student's computed score.

    Returns:
        True if either outlier condition holds, otherwise False.
    """
    gpa_scaled = row[2] * 2
    sat_scaled = row[1] / 160
    if row[3] == 0:
        return True
    gap_is_large = gpa_scaled - sat_scaled > 2
    return gap_is_large and score >= 5

# Step 5: Check for grade outliers
def has_grade_outlier(grades):
    """Report whether the lowest grade is a low outlier.

    The lowest grade counts as an outlier when it is more than 20 points
    below the second-lowest grade.

    Args:
        grades: Iterable of numeric grades.

    Returns:
        True if the gap between the two lowest grades exceeds 20. False
        otherwise, including when fewer than two grades are supplied (a
        single grade has nothing to be compared against).
    """
    ordered = sorted(grades)
    if len(ordered) < 2:
        return False
    lowest, second_lowest = ordered[0], ordered[1]
    return second_lowest - lowest > 20

# Step 6: Check for grade improvement
def grade_improvement(grades):
    """Report whether the grades never decrease from one entry to the next.

    Args:
        grades: Sequence of grades in chronological order (list, tuple, ...).

    Returns:
        True if the grades are already in non-decreasing order, else False.
        An empty sequence is considered ordered.
    """
    # sorted() always returns a list, so copy the input into a list first;
    # otherwise a tuple would never compare equal even when it is in order.
    return list(grades) == sorted(grades)

# Step 7: Combine criteria for admission
def combine_criteria(row, score):
    """Decide whether a student is selected under the combined criteria.

    A student is selected when the score is at least 6, or when the score
    is at least 5 and at least one of these holds: the record is an
    outlier, the grades (indices 5-8) never decrease, or the grades contain
    a low outlier.

    Args:
        row: Typed student record.
        score: The student's computed score.

    Returns:
        True if the student is selected, otherwise False.
    """
    grades = row[5:9]
    # All three checks are evaluated up front, regardless of the score.
    special_flags = (
        is_outlier(row, score),
        grade_improvement(grades),
        has_grade_outlier(grades),
    )
    if score >= 6:
        return True
    return score >= 5 and any(special_flags)

# Save scores to a file
def save_scores(scores, file_path):
    """Write name/score pairs to a file, one tab-separated pair per line.

    The file is opened in write mode, so existing content is replaced.

    Args:
        scores: Iterable of ``(name, score)`` pairs.
        file_path: Destination file path.
    """
    with open(file_path, 'w') as out:
        out.writelines(f"{name}\t{score}\n" for name, score in scores)

# Save combined results to a file
def save_combined(combined, file_path):
    """Write only the names from name/score pairs to a file, one per line.

    The file is opened in write mode, so existing content is replaced.
    The score in each pair is ignored.

    Args:
        combined: Iterable of ``(name, score)`` pairs.
        file_path: Destination file path.
    """
    with open(file_path, 'w') as out:
        out.writelines(f"{name}\n" for name, _score in combined)

# Main function
def main(
    input_file="/content/admissions_test2.csv",
    chosen_file="chosen_students.txt",
    outliers_file="outliers.txt",
    combined_file="extra_improved_chosen.txt",
):
    """Run the full pipeline: read, score, filter, and write result files.

    Calling ``main()`` with no arguments uses the default paths below.

    Args:
        input_file: Tab-delimited input file of student records.
        chosen_file: Output file for students whose score is at least 6
            (name and score per line).
        outliers_file: Output file for students flagged by ``is_outlier``
            (name and score per line).
        combined_file: Output file for students accepted by
            ``combine_criteria`` (name only per line).
    """
    # Step 1: Read and process data
    processed_data = [convert_row_type(row) for row in read_data(input_file)]

    # Step 2: Compute each score exactly once; every later step reuses it
    # instead of recomputing it per filter.
    scored_rows = [(row, compute_student_score(row)) for row in processed_data]

    # Step 3: Save students with scores >= 6
    high_scores = [(row[0], score) for row, score in scored_rows if score >= 6]
    save_scores(high_scores, chosen_file)

    # Step 4: Identify outliers
    outliers = [
        (row[0], score) for row, score in scored_rows if is_outlier(row, score)
    ]
    save_scores(outliers, outliers_file)

    # Step 5: Combine criteria
    combined = [
        (row[0], score)
        for row, score in scored_rows
        if combine_criteria(row, score)
    ]
    save_combined(combined, combined_file)

# Execute main only when run as a script or notebook cell, so that importing
# this module (e.g. to reuse or test the helpers) does not trigger file I/O.
if __name__ == "__main__":
    main()
