In [1]:
import pandas as pd

In [12]:
# Load the raw student-performance records from CSV.
# `file` is an absolute path, so point it at your own copy of the data
# when the notebook runs in a different environment.
file = "/content/sample_data/StudentsPerformance.csv"
stu_data = pd.read_csv(filepath_or_buffer=file)

In [13]:
# Descriptive statistics (count, mean, std, min, quartiles, max) for the
# columns that describe() summarises by default.
print("Summary of the dataset:", stu_data.describe(), sep="\n")

Summary of the dataset:
       math score  reading score  writing score
count  1000.00000    1000.000000    1000.000000
mean     66.08900      69.169000      68.054000
std      15.16308      14.600192      15.195657
min       0.00000      17.000000      10.000000
25%      57.00000      59.000000      57.750000
50%      66.00000      70.000000      69.000000
75%      77.00000      79.000000      79.000000
max     100.00000     100.000000     100.000000


In [14]:
# Re-wrap the data in a pandas DataFrame.
# NOTE(review): `stu_data` was produced by pd.read_csv, which already
# returns a DataFrame, so this step is not expected to change its contents.
stu_data = pd.DataFrame(data=stu_data)

In [15]:
# Preprocessing: encode the two binary categorical columns as integers
# (female/male -> 0/1, none/completed -> 0/1).
#
# The encoded Series is assigned back to the column instead of calling
# `stu_data[col].replace(..., inplace=True)`. That chained in-place form
# mutates a temporary Series: pandas 2.x emits a FutureWarning for it, and
# with Copy-on-Write (the default from pandas 3.0) `stu_data` is left
# unchanged.
#
# `.astype('int64')` pins the integer dtype explicitly rather than relying
# on `replace` to downcast the object column. It raises if a value outside
# the mapping (or a missing value) is present, and it keeps the cell safe
# to re-run on already-encoded columns.
stu_data['gender'] = (
    stu_data['gender']
    .replace({'female': 0, 'male': 1})
    .astype('int64')
)
stu_data['test preparation course'] = (
    stu_data['test preparation course']
    .replace({'none': 0, 'completed': 1})
    .astype('int64')
)

In [16]:
# Add a 'Total score' column: the row-wise sum of the math, writing and
# reading scores (same operand order as a plain `+` chain).
stu_data['Total score'] = (
    stu_data['math score']
    .add(stu_data['writing score'])
    .add(stu_data['reading score'])
)

In [17]:
# Report the column labels and the (rows, columns) shape of the frame.
# One print call; sep="\n" puts each heading and value on its own line.
print(
    "Column names:",
    stu_data.columns,
    "\nDimensions:",
    stu_data.shape,
    sep="\n",
)

Column names:
Index(['gender', 'race/ethnicity', 'parental level of education', 'lunch',
       'test preparation course', 'math score', 'reading score',
       'writing score', 'Total score'],
      dtype='object')

Dimensions:
(1000, 9)


In [18]:
# Preview the first and last rows (head()/tail() default to five rows each).
print("\nTop rows:", stu_data.head(), sep="\n")
print("\nBottom rows:", stu_data.tail(), sep="\n")


Top rows:
   gender race/ethnicity parental level of education         lunch  \
0       0        group B           bachelor's degree      standard   
1       0        group C                some college      standard   
2       0        group B             master's degree      standard   
3       1        group A          associate's degree  free/reduced   
4       1        group C                some college      standard   

   test preparation course  math score  reading score  writing score  \
0                        0          72             72             74   
1                        1          69             90             88   
2                        0          90             95             93   
3                        0          47             57             44   
4                        0          76             78             75   

   Total score  
0          218  
1          247  
2          278  
3          148  
4          229  

Bottom rows:
     gender race/et

In [19]:
# Show the frame's structure: index range, column names, non-null counts,
# dtypes and memory usage.
# DataFrame.info() writes its report to stdout itself and returns None, so
# it is called directly; wrapping it in print() would append a stray "None"
# line after the report.
print("\nData Structure:")
stu_data.info()


Data Structure:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 9 columns):
 #   Column                       Non-Null Count  Dtype 
---  ------                       --------------  ----- 
 0   gender                       1000 non-null   int64 
 1   race/ethnicity               1000 non-null   object
 2   parental level of education  1000 non-null   object
 3   lunch                        1000 non-null   object
 4   test preparation course      1000 non-null   int64 
 5   math score                   1000 non-null   int64 
 6   reading score                1000 non-null   int64 
 7   writing score                1000 non-null   int64 
 8   Total score                  1000 non-null   int64 
dtypes: int64(6), object(3)
memory usage: 70.4+ KB
None


In [20]:
# Convert the label-like columns to pandas categorical dtype.
# This covers the remaining text columns as well as the two columns that
# were encoded as 0/1 earlier (gender, test preparation course).
for column in (
    'gender',
    'race/ethnicity',
    'parental level of education',
    'lunch',
    'test preparation course',
):
    stu_data[column] = pd.Categorical(stu_data[column])

In [21]:
# Remove the 'lunch' column from the frame in place.
# Raises KeyError if the column is already gone (e.g. when the cell is
# run a second time without reloading the data).
del stu_data['lunch']

In [22]:
# Descriptive statistics again, now that 'lunch' has been dropped and the
# 'Total score' column exists.
print(
    "\nSummary after removing unnecessary columns:",
    stu_data.describe(),
    sep="\n",
)


Summary after removing unnecessary columns:
       math score  reading score  writing score  Total score
count  1000.00000    1000.000000    1000.000000  1000.000000
mean     66.08900      69.169000      68.054000   203.312000
std      15.16308      14.600192      15.195657    42.771978
min       0.00000      17.000000      10.000000    27.000000
25%      57.00000      59.000000      57.750000   175.000000
50%      66.00000      70.000000      69.000000   205.000000
75%      77.00000      79.000000      79.000000   233.000000
max     100.00000     100.000000     100.000000   300.000000


In [23]:
# Check for missing values: the first any() reduces each column to a flag,
# the second collapses those flags into a single True/False for the frame.
print("\n Missing values:", stu_data.isna().any().any(), sep="\n")


 Missing values:
False


In [24]:
# Write the cleaned frame to CSV; index=False leaves the row index out of
# the file.
clean_file_path = "/content/sample_data/clean_data_file.csv"
stu_data.to_csv(path_or_buf=clean_file_path, index=False)