In [15]:
from google.colab import drive
# Mount Google Drive at /content/drive so that files stored on Drive can be
# read through ordinary filesystem paths (the CSV loaded later lives under
# /content/drive/MyDrive).
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [16]:
import io

import pandas as pd

# Raw string: the original literal contained the sequence `\/`, which is not a
# valid escape and triggers an "invalid escape sequence" warning on recent
# Python versions. The raw form yields exactly the same characters.
# NOTE(review): the path therefore contains a literal backslash after "DA-1";
# confirm that this matches the actual Drive folder name.
file_path = r'/content/drive/MyDrive/EDA-LAB/DA-1\/CASchools.csv'
data = pd.read_csv(file_path)

# DataFrame.info() writes its report to a stream and returns None, so calling
# it directly inside the dict would store None under "Info". Capture the text
# in a buffer instead, and still print it so the cell shows the report.
info_buffer = io.StringIO()
data.info(buf=info_buffer)
info_text = info_buffer.getvalue()
print(info_text)

# Quick overview of the dataset: first rows, dimensions, column/dtype report
# and summary statistics of the numeric columns.
data_info = {
    "Head": data.head(),
    "Shape": data.shape,
    "Info": info_text,
    "Description": data.describe()
}

data_info

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 420 entries, 0 to 419
Data columns (total 15 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   rownames     420 non-null    int64  
 1   district     420 non-null    int64  
 2   school       420 non-null    object 
 3   county       420 non-null    object 
 4   grades       420 non-null    object 
 5   students     420 non-null    int64  
 6   teachers     420 non-null    float64
 7   calworks     420 non-null    float64
 8   lunch        420 non-null    float64
 9   computer     420 non-null    int64  
 10  expenditure  420 non-null    float64
 11  income       420 non-null    float64
 12  english      420 non-null    float64
 13  read         420 non-null    float64
 14  math         420 non-null    float64
dtypes: float64(8), int64(4), object(3)
memory usage: 49.3+ KB


{'Head':    rownames  district                           school   county grades  \
 0         1     75119               Sunol Glen Unified  Alameda  KK-08   
 1         2     61499             Manzanita Elementary    Butte  KK-08   
 2         3     61549      Thermalito Union Elementary    Butte  KK-08   
 3         4     61457  Golden Feather Union Elementary    Butte  KK-08   
 4         5     61523         Palermo Union Elementary    Butte  KK-08   
 
    students   teachers   calworks      lunch  computer  expenditure  \
 0       195  10.900000   0.510200   2.040800        67  6384.911133   
 1       240  11.150000  15.416700  47.916698       101  5099.380859   
 2      1550  82.900002  55.032299  76.322601       169  5501.954590   
 3       243  14.000000  36.475399  77.049202        85  7101.831055   
 4      1335  71.500000  33.108601  78.427002       171  5235.987793   
 
       income    english        read        math  
 0  22.690001   0.000000  691.599976  690.000000  
 1  

In [17]:
import matplotlib.pyplot as plt
import seaborn as sns


# Working frame for the analysis: the raw data without the 'math',
# 'rownames' and 'school' columns. `data` itself is left untouched.
features = data.drop(['math', 'rownames', 'school'], axis='columns')

# Global plot appearance: white-grid style, then the viridis colour palette.
sns.set(style="whitegrid")
sns.set_palette("viridis")




In [18]:
from sklearn.preprocessing import LabelEncoder, StandardScaler
# Replace every object-typed column of `features` with integer codes and keep
# the fitted encoder per column so the codes can be mapped back to labels.
label_encoders = {}
object_columns = features.select_dtypes(include=['object']).columns
for column in object_columns:
    encoder = LabelEncoder()
    encoded_values = encoder.fit_transform(features[column])
    features[column] = encoded_values
    label_encoders[column] = encoder



In [19]:
# Standardize numerical features for visual consistency.
#
# By this point the text columns of `features` have been label-encoded into
# integer codes, so a pure dtype filter would also pick them up as "numerical"
# and they would be standardized and then fed into the histograms, the z-score
# outlier check, PCA and KMeans as if they were measurements. Columns that are
# text in the raw `data` frame are therefore excluded by name.
# NOTE(review): 'district' looks like an identifier rather than a measurement;
# consider excluding it as well - confirm against the dataset documentation.
scaler = StandardScaler()
raw_text_columns = data.select_dtypes(include=['object']).columns
numeric_dtype_columns = features.select_dtypes(include=['int64', 'float64']).columns
numerical_features = numeric_dtype_columns[~numeric_dtype_columns.isin(raw_text_columns)]
features[numerical_features] = scaler.fit_transform(features[numerical_features])



In [None]:
# 2. Univariate Analysis - Histograms and box plots for numerical features
# One row per feature: histogram (with KDE) on the left, box plot on the right.
# squeeze=False keeps `axes` two-dimensional even when there is only a single
# feature; without it a one-row grid comes back as a 1-D array and the
# `axes[i, 0]` indexing below raises an IndexError.
fig, axes = plt.subplots(
    len(numerical_features),
    2,
    figsize=(12, 4 * len(numerical_features)),
    squeeze=False,
)

for i, col in enumerate(numerical_features):
    # Histogram
    sns.histplot(features[col], kde=True, ax=axes[i, 0])
    axes[i, 0].set_title(f'Histogram of {col}')

    # Box plot
    sns.boxplot(x=features[col], ax=axes[i, 1])
    axes[i, 1].set_title(f'Box Plot of {col}')

plt.tight_layout()
plt.show()



In [None]:
# 3. Bivariate Analysis - Correlation Matrix Heatmap
# Pairwise correlations between all columns of `features`, drawn as an
# annotated heatmap (two decimals per cell) with a colour bar.
correlation_matrix = features.corr()

plt.figure(figsize=(10, 8))
heatmap_ax = sns.heatmap(
    correlation_matrix,
    cmap="coolwarm",
    annot=True,
    fmt=".2f",
    cbar=True,
)
heatmap_ax.set_title("Correlation Matrix of Features")
plt.show()


In [None]:
# Additional Bivariate Analysis - Scatter Plots for Selected Pairs
# Each (x, y) pair gets its own figure, titled and labelled with the column names.
selected_pairs = [('students', 'teachers'), ('income', 'expenditure'), ('read', 'lunch')]
for x_col, y_col in selected_pairs:
    plt.figure(figsize=(8, 6))
    pair_ax = sns.scatterplot(x=x_col, y=y_col, data=features)
    pair_ax.set(
        title=f'Scatter Plot: {x_col} vs {y_col}',
        xlabel=x_col,
        ylabel=y_col,
    )
    plt.show()



In [None]:
# 4. Multivariate Analysis - Pair Plot
# Grid of pairwise scatter plots over every column of `features`, with KDE
# curves on the diagonal. Any failure is reported as a message instead of
# stopping the notebook.
pairplot_options = dict(diag_kind='kde', plot_kws=dict(alpha=0.5))
try:
    sns.pairplot(features, **pairplot_options)
    plt.suptitle("Pair Plot of Numerical Variables", y=1.02)
    plt.show()
except Exception as err:
    print("Pair plot generation encountered an issue:", err)



In [None]:
# Multivariate Analysis - Exploring Relationships with Boxplots
# Distribution of the 'read' column for each value of 'grades', taken from the
# raw (unscaled) `data` frame.
plt.figure(figsize=(12, 6))
sns.boxplot(data=data, x='grades', y='read').set(
    title='Boxplot of Grades vs Reading Scores',
    xlabel='Grades',
    ylabel='Reading Scores',
)
plt.show()



In [None]:
# Distribution of the 'math' column for each value of 'grades', taken from the
# raw (unscaled) `data` frame.
plt.figure(figsize=(12, 6))
sns.boxplot(data=data, x='grades', y='math').set(
    title='Boxplot of Grades vs Math Scores',
    xlabel='Grades',
    ylabel='Math Scores',
)
plt.show()


In [None]:
# 5. Additional EDA Methods

# Categorical Data Analysis - Bar Plots for Categorical Variables
# One count plot per object-typed column of the raw `data` frame.
categorical_columns = data.select_dtypes(include=['object']).columns
for col in categorical_columns:
    plt.figure(figsize=(10, 6))
    count_ax = sns.countplot(data=data, x=col)
    count_ax.set(title=f'Count Plot of {col}', xlabel=col, ylabel='Count')
    plt.show()



In [None]:
# Outlier Detection - Z-Score Method
from scipy import stats
# For every numerical feature, count the values whose absolute z-score is
# greater than 3 and report that count.
for col in numerical_features:
    beyond_three_sigma = abs(stats.zscore(features[col])) > 3
    print(f"Number of outliers in {col}: {beyond_three_sigma.sum()}")



In [None]:
# PCA for Dimensionality Reduction
from sklearn.decomposition import PCA
# Project the numerical features onto their first two principal components.
pca = PCA(n_components=2)
pca_features = pca.fit_transform(features[numerical_features])

# Scatter of the two components, coloured by the 'grades' value of each row
# in the raw `data` frame.
plt.figure(figsize=(10, 6))
pca_ax = sns.scatterplot(
    x=pca_features[:, 0],
    y=pca_features[:, 1],
    hue=data['grades'],
    palette='viridis',
)
pca_ax.set(
    title='PCA of Numerical Features',
    xlabel='Principal Component 1',
    ylabel='Principal Component 2',
)
plt.show()



In [None]:
# Clustering - KMeans for Segmentation
from sklearn.cluster import KMeans
# Segment the rows into three clusters based on the numerical features.
# n_init is set explicitly: its default changed between scikit-learn releases
# (10 restarts -> 'auto'), and leaving it unset emits a FutureWarning on some
# versions and can give different clusters depending on the installed version.
# Together with random_state this keeps the assignment reproducible.
kmeans = KMeans(n_clusters=3, n_init=10, random_state=42)
features['Cluster'] = kmeans.fit_predict(features[numerical_features])

# Income against expenditure, coloured by assigned cluster.
# Both axes show the values stored in `features`, not the raw `data` values.
plt.figure(figsize=(10, 6))
sns.scatterplot(x=features['income'], y=features['expenditure'], hue=features['Cluster'], palette='viridis')
plt.title('KMeans Clustering of Features')
plt.xlabel('Income')
plt.ylabel('Expenditure')
plt.show()
