In [None]:
import pandas as pd

# Load the raw survey export
file_path = "How Effective Are AI-Powered Tools in Enhancing Student Academic Success.csv"
df = pd.read_csv(file_path)


# Remove survey-platform bookkeeping and respondent-identifying columns.
# errors='ignore' lets this run even when some of them are absent.
cols_to_drop = [
    "Respondent ID", "Collector ID", "Start Date", "End Date",
    "IP Address", "Email Address", "First Name", "Last Name", "Custom Data 1"
]
df = df.drop(columns=cols_to_drop, errors='ignore')

# Define and drop the consent column
consent_col = "CONSENT STATEMENTI have read and understood the participant information above and I freely and voluntarily give my consent to participate in this project/study."
df = df.drop(columns=[consent_col], errors='ignore')

# Remove the first row (assumed to be metadata)
df = df.iloc[1:].reset_index(drop=True)

# Drop any columns whose names start with "Unnamed"
df = df.loc[:, ~df.columns.str.startswith("Unnamed")]

# Remove rows where AI awareness column is NaN
df = df.dropna(subset=['Are you aware of AI tools like ChatGPT?'])

# Add an ID column (starting from 1); numbering follows the remaining row order
df.insert(0, 'ID', range(1, len(df) + 1))

# Rename some of the columns for clarity.
# rename() silently skips keys that are not present, so a misspelled key
# leaves the column untouched. The level-of-study question is mapped both
# with and without the trailing colon so that 'Degree Level' is produced
# whichever spelling the export uses.
df = df.rename(columns={
    'What is your gender?': 'Gender',
    'What is your age range?': 'Age Group',
    'What is your course of study?': 'Course of Study',
    'What is your level of study?': 'Degree Level',
    'What is your level of study?:': 'Degree Level',
    'What country are you studying in?': 'Country of Study',
})

# Maps each standardized course name to the raw spellings seen in responses.
# Every variation needs its own trailing comma: adjacent string literals
# without one are silently concatenated into a single (wrong) entry.
course_column = "Course of Study"
course_mapping = {
    "Law": ["LLB", "law", "LAW"],
    "IT & BIS": [
        "information technology and business information systems",
        "Information Technology and Business Information Systems",
        "BSc Information Technology and Business Information Systems Top-Up",
        "Bsc Information Technology and Business Information System",
        "IT & Business Information Systems",
        "IT and Business",
        "IT and Business Information Systems",
        "IT and business information systems",
        "formation Technology and Business Information Systems",
        "BSs information technology and business information systems",
        "INFORMATION TECHNOLOGY AND BUSINESS INFORMATION SYSTEMS",
        "BSc IT and BIS",
        "IT and BIS",
    ],
    "Accountancy": ["Accountancy", "Accounting"],
    "Medicine and Surgery": ["Medicine", "Medicine & Surgery"],
    "Physics": ["Physics", "Physics and Astronomy"],
    "Nursing": ["Nursing", "Nursing sci."],
}

# Function to standardize course names with fuzzy matching for IT & BIS
def standardize_course(course_name):
    """Return the standardized course name for one raw survey answer.

    Lookup order:
      1. Missing values (per ``pd.isna``) are returned unchanged.
      2. An answer equal to any variation in ``course_mapping``, compared
         case-insensitively after trimming whitespace, yields that entry's
         standard name.
      3. An answer containing "information" or "bis" (case-insensitive)
         yields "IT & BIS".
      4. Anything else is returned trimmed and title-cased.

    Args:
        course_name: A single cell value from the course column.

    Returns:
        The standardized course name, or the input itself when missing.
    """
    if pd.isna(course_name):
        return course_name

    # Coerce to str so a non-text cell is cleaned instead of raising on .strip()
    course_name = str(course_name).strip()
    course_lower = course_name.lower()

    # Compare case-insensitively so "accounting" or "llb" resolve the same
    # way as the capitalizations listed in the mapping.
    for standard, variations in course_mapping.items():
        if any(course_lower == variation.lower() for variation in variations):
            return standard

    # Substring fallback for IT & BIS spellings not listed in the mapping.
    # NOTE(review): "bis" is a plain substring test, so any course name that
    # merely contains those letters is also classified as IT & BIS.
    if "information" in course_lower or "bis" in course_lower:
        return "IT & BIS"

    return course_name.title()

# Run every entry of the course column through standardize_course
raw_course_values = df['Course of Study']
df['Course of Study'] = raw_course_values.apply(standardize_course)

# Function to standardize Country of Study
def standardize_country(country_name):
    """Return the standardized country name for one raw survey answer.

    Rules, checked in order (all case-insensitive):
      * Missing values (per ``pd.isna``) become "United Kingdom".
      * Contains "states", or has "usa" as a whole word -> "United States".
      * Contains "kingdom", "england" or "london", or has "uk" as a whole
        word -> "United Kingdom".
      * Contains the misspelling "dudai" -> "Dubai".
      * Anything else is returned trimmed and title-cased.

    The short abbreviations are matched as whole words because a plain
    substring test also hits unrelated names ("uk" in "Ukraine", "usa" in
    "Jerusalem").

    Args:
        country_name: A single cell value from the country column.

    Returns:
        The standardized country name.
    """
    if pd.isna(country_name):
        # NOTE(review): missing answers are assumed to be UK-based; confirm
        # this default is appropriate for the survey population.
        return "United Kingdom"

    # Coerce to str so a non-text cell is cleaned instead of raising on .lower()
    cleaned = str(country_name).strip()
    lowered = cleaned.lower()

    # Break the answer into alphabetic words so "London, UK" yields {"london", "uk"}
    words = set(
        "".join(ch if ch.isalpha() else " " for ch in lowered).split()
    )

    if "states" in lowered or "usa" in words:
        return "United States"
    if any(key in lowered for key in ("kingdom", "england", "london")) or "uk" in words:
        return "United Kingdom"
    if "dudai" in lowered:
        return "Dubai"
    return cleaned.title()

# Run every entry of the country column through standardize_country
raw_country_values = df['Country of Study']
df['Country of Study'] = raw_country_values.apply(standardize_country)

# Missing free-text suggestions receive the placeholder answer "I do not know"
suggestions_col = "What suggestions do you have for making AI tools more beneficial in education?"
filled_suggestions = df[suggestions_col].fillna("I do not know")
df[suggestions_col] = filled_suggestions

# Write the cleaned data to a new CSV file, leaving out the DataFrame index
df.to_csv('cleaned_dataset.csv', index=False)
# df.head(50)


Index(['ID', 'Gender', 'Age Group', 'Course of Study',
       'What is your level of study?', 'Country of Study',
       'Are you aware of AI tools like ChatGPT?',
       'Do you use AI tools in your education?',
       'If yes, in what ways do you use AI tools for learning?',
       'Do you use AI tools to understand difficult topics taught in class?',
       'How frequently do you use AI tools for understanding difficult topics?',
       'Do you use AI tools as an aid in completing assignments?',
       'How often do you use AI tools for assignments?',
       'When completing assignments, do you still conduct additional research beyond AI-generated responses?',
       'How effective do you think AI tools are in explaining difficult topics and improving your understanding?',
       'Can you provide an example of when an AI tool helped you understand a concept better?',
       'How would you describe your dependence on AI tools for school work?',
       'If you feel dependent on AI too