# Summarizing & Computing Descriptive Statistics

# Import necessary libraries

In [1]:
import numpy as np
import pandas as pd

# Create a pandas DataFrame object named 'df' with four rows and two columns. Set the index parameter to ['a','b','c','d'] and the columns parameter to ["one","two"].

In [2]:
# Build the frame column by column; NaN marks a missing observation.
df = pd.DataFrame(
    {
        "one": [2.4, 6.3, np.nan, 0.75],
        "two": [np.nan, -5.4, np.nan, -1.3],
    },
    index=["a", "b", "c", "d"],
)
df

Unnamed: 0,one,two
a,2.4,
b,6.3,-5.4
c,,
d,0.75,-1.3


# Calculate the sum of each column in the DataFrame df. It should return a new Series object that contains the sum of values for each column.

In [3]:
# Reduce down the rows (axis 0) so there is one total per column; NaN is skipped.
df.sum(axis=0)

one    9.45
two   -6.70
dtype: float64

# Calculate the sum of each row in the DataFrame df. It should return a new Series object that contains the sum of values for each row.

In [4]:
# Reduce across the columns so there is one total per row label.
df.sum(axis="columns")

a    2.40
b    0.90
c    0.00
d   -0.55
dtype: float64

# Calculate the mean (average) of each row in the DataFrame df. It should return a new Series object that contains the mean value for each row.

In [5]:
# Row-wise mean; NaN entries are ignored, so only an all-NaN row yields NaN.
df.mean(axis="columns")

a    2.400
b    0.450
c      NaN
d   -0.275
dtype: float64

# Calculate the mean (average) of each row in the DataFrame df without skipping missing values (skipna=False), so that any row containing a NaN produces NaN. It should return a new Series object that contains the mean value for each row.

In [6]:
# Row-wise mean with NaN propagation: a single missing value makes the row's mean NaN.
df.mean(axis="columns", skipna=False)

a      NaN
b    0.450
c      NaN
d   -0.275
dtype: float64

# Find the index label of the maximum value in each column of the DataFrame df. It should return a new Series object that contains the index label for each column where the maximum value is located.

In [7]:
# For every column, the row label at which its largest value occurs.
df.idxmax(axis=0)

one    b
two    d
dtype: object

# Find the index label of the minimum value in each column of the DataFrame df. It should return a new Series object that contains the index label for each column where the minimum value is located.

In [8]:
# For every column, the row label at which its smallest value occurs.
df.idxmin(axis=0)

one    d
two    b
dtype: object

# Calculate the cumulative sum for each column in the DataFrame df. It should return a new DataFrame that contains the cumulative sum of values for each column.

In [9]:
# Running total down each column; positions that hold NaN stay NaN in the result.
df.cumsum(axis=0)

Unnamed: 0,one,two
a,2.4,
b,8.7,-5.4
c,,
d,9.45,-6.7


# Generate descriptive statistics for the columns in the DataFrame df. It should return a new DataFrame that provides various statistical measures for each column, including count, mean, standard deviation, minimum value, quartiles, and maximum value.

In [10]:
# Summary statistics per column, with the default quartiles spelled out explicitly.
df.describe(percentiles=[0.25, 0.5, 0.75])

Unnamed: 0,one,two
count,3.0,2.0
mean,3.15,-3.35
std,2.85,2.899138
min,0.75,-5.4
25%,1.575,-4.375
50%,2.4,-3.35
75%,4.35,-2.325
max,6.3,-1.3


# Read the CSV file "iris.data" from the "Data" folder (the file has no header row, so pass header=None) and assign the resulting data to a pandas DataFrame called 'iris'

In [13]:
# The file has no header row, so pandas labels the columns with integers.
iris = pd.read_csv("Data/iris.data", header=None)
iris.head(n=5)

Unnamed: 0,0,1,2,3,4
0,5.1,3.5,1.4,0.2,Iris-setosa
1,4.9,3.0,1.4,0.2,Iris-setosa
2,4.7,3.2,1.3,0.2,Iris-setosa
3,4.6,3.1,1.5,0.2,Iris-setosa
4,5.0,3.6,1.4,0.2,Iris-setosa


# Assign new column names to the DataFrame 'iris'. column names should be ['sepal_length','sepal_width','petal_length','petal_width','class']

In [14]:
# Replace the positional integer labels with descriptive column names.
iris.columns = [
    "sepal_length",
    "sepal_width",
    "petal_length",
    "petal_width",
    "class",
]
iris.head(n=5)

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,class
0,5.1,3.5,1.4,0.2,Iris-setosa
1,4.9,3.0,1.4,0.2,Iris-setosa
2,4.7,3.2,1.3,0.2,Iris-setosa
3,4.6,3.1,1.5,0.2,Iris-setosa
4,5.0,3.6,1.4,0.2,Iris-setosa


# Calculate the correlation coefficient between the "sepal_length" column and the "sepal_width" column in the DataFrame 'iris'.

In [15]:
# Pearson correlation (the default method) between the two sepal measurements.
iris["sepal_length"].corr(iris["sepal_width"], method="pearson")

-0.10936924995064935

# Calculate the correlation matrix for all the numeric columns in the DataFrame 'iris' (the text column "class" is excluded). It should return a new DataFrame that contains the pairwise correlation coefficients between all pairs of numeric columns.

In [16]:
# Correlation is only defined for numeric data. Since pandas 2.0, DataFrame.corr()
# no longer drops non-numeric columns silently and raises on the string "class"
# column, so restrict the frame to its numeric columns first. This gives the same
# 4x4 matrix on older pandas versions as well.
iris.select_dtypes(include="number").corr()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width
sepal_length,1.0,-0.109369,0.871754,0.817954
sepal_width,-0.109369,1.0,-0.420516,-0.356544
petal_length,0.871754,-0.420516,1.0,0.962757
petal_width,0.817954,-0.356544,0.962757,1.0


# Calculate the covariance matrix for all the numeric columns in the DataFrame 'iris' (the text column "class" is excluded). It should return a new DataFrame that contains the pairwise covariances between all pairs of numeric columns.

In [17]:
# Covariance is only defined for numeric data. Since pandas 2.0, DataFrame.cov()
# no longer drops non-numeric columns silently and raises on the string "class"
# column, so restrict the frame to its numeric columns first. This gives the same
# 4x4 matrix on older pandas versions as well.
iris.select_dtypes(include="number").cov()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width
sepal_length,0.685694,-0.039268,1.273682,0.516904
sepal_width,-0.039268,0.188004,-0.321713,-0.117981
petal_length,1.273682,-0.321713,3.113179,1.296387
petal_width,0.516904,-0.117981,1.296387,0.582414


# Calculate the correlation coefficients between each numeric column in the DataFrame 'iris' and the "petal_length" column. It should return a new Series object that contains the correlation coefficient of each numeric column in relation to the "petal_length" column.

In [18]:
# Correlate every numeric column with petal_length. The string "class" column
# cannot be correlated and makes corrwith() raise on pandas >= 2.0 (older
# versions dropped it silently), so select the numeric columns explicitly.
iris.select_dtypes(include="number").corrwith(iris["petal_length"])

sepal_length    0.871754
sepal_width    -0.420516
petal_length    1.000000
petal_width     0.962757
dtype: float64

# Create a pandas Series object named 's' that contains a sequence of values. ["b","b","b","b","c","c","a","a","a"]

In [20]:
# Each character of the string becomes one element: four b's, two c's, three a's.
s = pd.Series(list("bbbbccaaa"))
s

0    b
1    b
2    b
3    b
4    c
5    c
6    a
7    a
8    a
dtype: object

# Retrieve the unique values from the pandas Series object 's'. It should return an array containing the unique values in the order they appear in the Series.

In [21]:
# Distinct values in order of first appearance (not sorted).
pd.unique(s)

array(['b', 'c', 'a'], dtype=object)

# Count the occurrences of each unique value in the pandas Series object 's'. It should return a new Series that represents the frequency count of each unique value.

In [23]:
# Frequency of each distinct value, most frequent first.
s.value_counts(sort=True, ascending=False)

b    4
a    3
c    2
dtype: int64

# Create a new pandas Series object named 'x' that indicates whether each value in the Series 's' is present in the list ["b", "c"].

In [24]:
# Boolean mask: True wherever the element of s is one of the wanted values.
x = s.isin({"b", "c"})
x

0     True
1     True
2     True
3     True
4     True
5     True
6    False
7    False
8    False
dtype: bool

# Filter and retrieve the values from the Series 's' that correspond to the True values in the boolean Series 'x'.

In [25]:
# Keep only the elements of s at positions where the mask x is True.
s.loc[x]

0    b
1    b
2    b
3    b
4    c
5    c
dtype: object