# Pandas is a fast, powerful, flexible and easy-to-use open-source data analysis and manipulation tool; it can read files such as Excel or CSV.
# Numpy provides support for multidimensional arrays, matrices, and a collection of mathematical functions to operate on these arrays efficiently
# Matplotlib is used to visualize the data

# Import library

In [None]:
# If a library is not installed, install it first with the matching command below:
# pip install pandas
# pip install numpy
# pip install matplotlib
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Read file

In [None]:
# Load the dataset; if the file is not in the working directory, pass its full path.
# read_csv parses CSV text; an .xlsx workbook needs pd.read_excel instead.
data = pd.read_csv("sleep.csv")
# Preview the first 5 rows (head() defaults to 5)
data.head()

In [None]:
# Last 5 rows (tail() defaults to 5)
data.tail()

In [None]:
# 10 randomly chosen rows
data.sample(10)

# Understand data

In [None]:
# head() shows 5 rows by default; pass a number to see more or fewer
data.head(6)

In [None]:
# Show the column names
data.columns

In [None]:
# Row index of the DataFrame (its row labels); use len(data) for the number of rows
data.index

In [None]:
# Dimensions as a (rows, columns) tuple
data.shape

In [None]:
# Summary statistics of the columns
data.describe()

# Column slicing

Let's slice a single column in two ways.

In [None]:
# Dot access works when the column name contains no spaces, e.g. data.Age
data.Age

In [None]:
# A column name containing a space needs brackets: data['Sleep Duration']
# (data.Sleep Duration is a syntax error).
# NOTE(review): the name `occ` suggests Occupation, but it holds Sleep Duration — confirm intent.
occ = data['Sleep Duration']

In [None]:
# Any Series operation can be applied to a column; here, the mean age
Avg_age = data.Age.mean()
Avg_age

In [None]:
# Distinct values found in the Age column
data.Age.unique()

In [None]:
# Number of distinct values in the Age column
data.Age.nunique()

In [None]:
# Data type (dtype) of the Age column
data.Age.dtypes

In [None]:
# How many times each distinct heart-rate value occurs
data['Heart Rate'].value_counts()

# How to slice many columns 

Let's slice several columns in three ways.

In [None]:
# Method 1: index with a list of column names to get a sub-DataFrame
selected_columns = ['Age', 'Gender', 'Sleep Duration']
Age_Geder_SleepDuration = data[selected_columns]
Age_Geder_SleepDuration.head()

In [None]:
# Method 2: iloc selects by integer position -> iloc[rows, columns].
# The stop of a slice is exclusive, so :4 yields positions 0, 1, 2 and 3,
# i.e. exactly the four columns the variable name promises (0:5 gave five).
four_col = data.iloc[:, :4]
four_col.head()

In [None]:
# Method 3: loc selects by label or boolean mask -> loc[rows, columns].
# Build the row mask first, then keep every column of the matching rows.
is_male = data['Gender'] == 'Male'
Male = data.loc[is_male, :]
Male.head()

In [None]:
# loc with ":" for the rows keeps every row and only the listed columns
bmi_pressure_columns = ['BMI Category', 'Blood Pressure']
BMI_Pressure = data.loc[:, bmi_pressure_columns]
BMI_Pressure.head()

In [None]:
# Two conditions combined with & (element-wise AND): rows that are both male and doctors
is_male = data['Gender'] == 'Male'
is_doctor = data['Occupation'] == 'Doctor'
Male_Doctor = data.loc[is_male & is_doctor, :]
Male_Doctor.head()

In [None]:
# Two conditions combined with | (element-wise OR): rows that are female
# or that have more than 8000 daily steps
is_female = data['Gender'] == 'Female'
over_8000_steps = data['Daily Steps'] > 8000
Female_steps = data.loc[is_female | over_8000_steps, :]
Female_steps.head()

# Cleaning 

In [None]:
# Count the missing values in each column
data.isnull().sum()

- We should handle null values either by replacing them or by deleting the rows that contain them.
- In numerical data, replace null values with the column average.
- In categorical data, replace null values with the most frequent value.

In [None]:
# Frequency of each occupation
data['Occupation'].value_counts()

In [None]:
# Fill missing categorical values.
# Occupation: use the most frequent value, computed from the data rather than
# hard-coded, so the cell stays correct if the file changes. mode() returns
# its results sorted, so ties are resolved by taking the first entry.
occupation_modes = data['Occupation'].mode()
if not occupation_modes.empty:  # empty only when the whole column is missing
    data['Occupation'] = data['Occupation'].fillna(occupation_modes.iloc[0])
# Sleep Disorder: a missing entry is labelled explicitly as 'no effect'.
data['Sleep Disorder'] = data['Sleep Disorder'].fillna('no effect')

In [None]:
# Fill missing numeric values with the mean of their own column.
# fillna is the dedicated API for this, and the loop keeps the column
# list in one place instead of repeating the expression per column.
for column in ('Heart Rate', 'Sleep Duration'):
    data[column] = data[column].fillna(data[column].mean())

In [None]:
# Drop every row that still contains a missing value
data = data.dropna()

In [None]:
# Verify: every column should now report 0 missing values
data.isnull().sum()

In [None]:
# 'Normal Weight' and 'Normal' denote the same BMI category,
# so fold the former into the latter
bmi_aliases = {'Normal Weight': 'Normal'}
data['BMI Category'] = data['BMI Category'].replace(bmi_aliases)
data.head()

In [None]:
# Remove duplicated rows
data = data.drop_duplicates()

# Insights

In [None]:
# Keep only the two columns needed to compare sleep duration across genders
# (df1 is reused by later cells).
df1 = data[['Gender', 'Sleep Duration']]
# Average sleep duration per gender. The former mid-cell df1.head() was
# removed: only a cell's last expression is displayed, so it had no effect.
mean_sleep_duration_by_gender = df1.groupby('Gender')['Sleep Duration'].mean()
mean_sleep_duration_by_gender

In [None]:
# Group by two keys at once: mean sleep duration for every
# (Occupation, BMI Category) pair, returned as a one-column DataFrame
group_keys = ['Occupation', 'BMI Category']
grouped_data = data.groupby(group_keys)[['Sleep Duration']].mean()
grouped_data

# Concat & Merge dataframes

In [None]:
# Preview df1 before combining it with another frame
df1.head()

In [None]:
# Second frame for the concat example: the BMI category and sleep disorder columns
df2 = data[["BMI Category",'Sleep Disorder']]
df2.head()

In [None]:
# df1 and df2 hold different columns of the same rows, so place them side by
# side (axis=1), aligned on their shared index. Stacking them row-wise would
# double the row count and leave NaN in every column a frame does not have.
# ignore_index is omitted because with axis=1 it would replace the column
# names with 0..n-1.
concatenated_df = pd.concat([df1, df2], axis=1)
concatenated_df

# To merge, you need a common column to join on, similar to VLOOKUP in Excel
Example: merged_df = pd.merge(df1, df2, on='ID', how='inner')

# Visualization

In [None]:
# Mean sleep duration for each gender
mean_sleep_duration_by_gender = df1.groupby('Gender')['Sleep Duration'].mean()

# Draw the pie on an explicit Axes; each wedge is labelled with its gender
# and with its percentage share of the summed means
fig, ax = plt.subplots()
ax.pie(
    mean_sleep_duration_by_gender,
    labels=mean_sleep_duration_by_gender.index,
    autopct='%1.1f%%',
)

# Title plus the two axis-label texts
ax.set_title('Sleep duration by gender')
ax.set_xlabel('Gender')
ax.set_ylabel('Mean Sleep Duration')

# Render the figure
plt.show()

In [None]:
# Average quality of sleep per occupation
occupation_quality = data.groupby('Occupation')['Quality of Sleep'].mean()

# Category names for the x-axis and the matching bar heights
occupations = occupation_quality.index
quality_of_sleep = occupation_quality.values

# Bar chart drawn on an explicit 10x6-inch figure
fig, ax = plt.subplots(figsize=(10, 6))
ax.bar(occupations, quality_of_sleep, color='skyblue')

# Title and axis labels
ax.set_title('Average Quality of Sleep by Occupation')
ax.set_xlabel('Occupation')
ax.set_ylabel('Quality of Sleep')

# Tilt the occupation names so long labels do not overlap
plt.xticks(rotation=45, ha='right')

# Render the figure
plt.show()

In [None]:
# Heart-rate values to plot
heart_rate_data = data['Heart Rate']

# Histogram with 5 bins on an 8x6-inch figure
plt.figure(figsize=(8, 6))
plt.hist(heart_rate_data, bins=5, color='red', edgecolor='black')

# Title and labels. The y-axis of a histogram counts the observations that
# fall into each bin, so it is labelled 'Frequency' (was the unclear 'rates').
plt.title('Histogram of Heart Rate')
plt.xlabel('Heart Rate')
plt.ylabel('Frequency')

# Grid lines make the bar heights easier to read; then render the figure
plt.grid(True)
plt.show()