**Step 1: Loading the Dataset**

In [None]:
import pandas as pd

# Read the cereal CSV at the path below into a DataFrame
csv_path = '/content/drive/MyDrive/cognorise/Data analytics/project2/cereal.csv'
df = pd.read_csv(csv_path)

# Preview the top rows as the cell output
df.head()


**Step 2: Understanding the Dataset**

Check the basic information about the dataset:



In [None]:
# Preview the first rows. A notebook only renders the last expression of a
# cell automatically, so a mid-cell result needs an explicit display() call.
display(df.head())

# Print column names, dtypes and non-null counts
df.info()

# Summary statistics; as the last expression this is rendered as the cell output
df.describe()


**Step 3: Cleaning the Data**

Check for missing values and handle them:

In [None]:
# Count the missing entries in each column
df.isna().sum()

In [None]:

from sklearn.preprocessing import LabelEncoder
# Drop incomplete rows first (for simplicity, no imputation) so that the
# encoder below is only fitted on rows that are actually kept.
df.dropna(inplace=True)

# Replace the cereal names with integer codes. `encoder` is kept around so the
# codes can be mapped back to the original names with inverse_transform later.
encoder = LabelEncoder()
df['name'] = encoder.fit_transform(df['name'])


**Step 4: Exploratory Data Analysis**

1. **Column Distribution:** Check the distribution of each column.

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Draw one 20-bin histogram panel per numeric column of df on a 15x10 figure
hist_axes = df.hist(figsize=(15, 10), bins=20)

# Adjust subplot spacing so panel titles and tick labels do not overlap
plt.tight_layout()
plt.show()


In [None]:
# Select the object-dtype (text) columns and keep only their labels
object_frame = df.select_dtypes(include='object')
non_numeric_cols = object_frame.columns

# Report which columns were found
print(f"Non-numeric columns: {non_numeric_cols}")

In [None]:

# Remove the object-dtype (text) columns so that only numeric columns remain.
# 'name' is expected to be integer-coded by this point, so it is not among
# them and is kept. errors='ignore' makes the cell safe to re-run after the
# columns have already been removed.
df.drop(columns=non_numeric_cols, errors='ignore', inplace=True)


In [None]:
# Try to convert the text columns to numeric values where the data allows it.
for col in non_numeric_cols:
    if col not in df.columns:
        # Column is no longer in the frame (e.g. it was dropped); nothing to convert
        continue
    try:
        df[col] = pd.to_numeric(df[col])
    except (ValueError, TypeError):
        # Values cannot be parsed as numbers; leave the column unchanged
        continue

In [None]:

# Replace every remaining NaN with 0, modifying df in place
df.fillna(0, inplace=True)


In [None]:

# Pairwise Pearson correlation (the pandas default method) between all columns of df
correlation_matrix = df.corr(method='pearson')

# Show the matrix as plain text
print(correlation_matrix)


2. **Correlation Matrix:** See how features are correlated.

In [None]:
# Annotated heatmap of the pairwise column correlations
corr_values = df.corr()
plt.figure(figsize=(12, 8))
heatmap_ax = sns.heatmap(corr_values, cmap='coolwarm', annot=True)
heatmap_ax.set_title('Correlation Matrix')
plt.show()


3. **Analysis of Nutritional Content:**

**Calories:** Check the distribution of calories.

In [None]:
# Histogram of the 'calories' column (20 bins) with a KDE curve overlaid
plt.figure(figsize=(10, 6))
calories_ax = sns.histplot(df['calories'], kde=True, bins=20)
calories_ax.set(title='Distribution of Calories', xlabel='Calories', ylabel='Frequency')
plt.show()


**Sugars:** Check the distribution of sugar content.

In [None]:
# Histogram of the 'sugars' column (20 bins) with a KDE curve overlaid
plt.figure(figsize=(10, 6))
sugars_ax = sns.histplot(df['sugars'], kde=True, bins=20)
sugars_ax.set(title='Distribution of Sugars', xlabel='Sugars', ylabel='Frequency')
plt.show()


**Fiber vs. Sugar:** Analyze the relationship between fiber and sugar.


In [None]:
# Scatter plot with one point per row: 'fiber' on the x-axis, 'sugars' on the y-axis
plt.figure(figsize=(10, 6))
scatter_ax = sns.scatterplot(data=df, y='sugars', x='fiber')
scatter_ax.set(title='Fiber vs. Sugars', xlabel='Fiber', ylabel='Sugars')
plt.show()


4. **Top 10 Cereals by Rating:**

In [None]:
# Map the integer codes back to the original cereal names. The dtype check
# makes this cell safe to re-run: once the names are restored the column is no
# longer integer-typed, so the inverse transform is skipped instead of failing.
if pd.api.types.is_integer_dtype(df['name']):
    df['name'] = encoder.inverse_transform(df['name'])



In [None]:
# Keep the ten rows with the highest 'rating' values (first occurrence wins on ties)
top_10_cereals = df.nlargest(n=10, columns='rating', keep='first')


In [None]:
# List the columns of the top-10 frame to confirm the cereal-name column label
available_columns = top_10_cereals.columns
print(available_columns)


In [None]:
# Horizontal bar chart of the top-10 frame: rating on the x-axis, one bar per cereal name
plt.figure(figsize=(12, 6))
rating_ax = sns.barplot(data=top_10_cereals, y="name", x="rating")
rating_ax.set(title="Top 10 Cereals by Rating", xlabel="Rating", ylabel="Cereal Name")
plt.show()