In [99]:
import pandas as pd
import numpy as np
df = pd.read_csv(r'C:\Users\nrmmw\Documents\Flatiron\dsc-statistical-methods-in-pandas\titanic.csv', index_col = 0)
# index_col = 0: sets the index_col to a specific col
df

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S
888,889,0,?,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C


`df.info()` is used to get summary metadata about a dataframe

It helps us get dataframe-level summary of statistics.

It gives us info like:
- The number of columns and rows in the DataFrame
- The data type of the data each column contains
- How many values each column contains (NaNs are not counted)
- The memory footprint of the DataFrame

In [100]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    object 
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(4), object(6)
memory usage: 90.5+ KB


the `df.describe()` method is used to dig into the summary statistics of the dataset

`df.describe()` defaulty only summarizes the numerical columns. To describe only categorical data we use `df.describe(include = "O")`

In [101]:
df.describe()

Unnamed: 0,PassengerId,Survived,Age,SibSp,Parch,Fare
count,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,20.125,0.0,0.0,7.9104
50%,446.0,0.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,38.0,1.0,0.0,31.0
max,891.0,1.0,80.0,8.0,6.0,512.3292


In [102]:
# Calculate the mean of all columns
df.mean()

  df.mean()


PassengerId    446.000000
Survived         0.383838
Age             29.699118
SibSp            0.523008
Parch            0.381594
Fare            32.204208
dtype: float64

In [103]:
# Get the mean of a specific column
df['Fare'].mean()

32.2042079685746

In [104]:
# Get the 90th percentile for Age
df['Age'].quantile(0.90)

50.0

In [105]:
# Get the median for all
df['Age'].median()

28.0

**Other common statistics that can be used**
- .mode() -- the mode of the column
- .count() -- the count of the total number of entries in a column
- .std() -- the standard deviation for the column
- .var() -- the variance for the column
- .sum() -- the sum of all values in the column
- .cumsum() -- the cumulative sum, where each cell index contains the sum of all indices lower than, and including, itself.

#### For Categorical Columns

The above statistics cannot be used for non-numeric data but there are other methods that can be used

In [106]:
#.unique() - shows us all the unique values in the col
df['Embarked'].unique()

array(['S', 'C', 'Q', nan], dtype=object)

In [107]:
#.value_counts - shows the count of every unique value in a col
df['Embarked'].value_counts()

S    644
C    168
Q     77
Name: Embarked, dtype: int64

`.apply()` is used to pass lambda functions to a Panda Series and `.applymap()` is used when working with Panda DataFrames

They don't mutate the original dataset but return a copy containing the result

In [108]:
# Function to convert every value in the DataFrame
# to a string
string_df = df.applymap(lambda x: str(x))
string_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   PassengerId  891 non-null    object
 1   Survived     891 non-null    object
 2   Pclass       891 non-null    object
 3   Name         891 non-null    object
 4   Sex          891 non-null    object
 5   Age          891 non-null    object
 6   SibSp        891 non-null    object
 7   Parch        891 non-null    object
 8   Ticket       891 non-null    object
 9   Fare         891 non-null    object
 10  Cabin        891 non-null    object
 11  Embarked     891 non-null    object
dtypes: object(12)
memory usage: 90.5+ KB


In [109]:
# Square every value in the Age column
df['Age'].apply(lambda x: x*x)

0       484.0
1      1444.0
2       676.0
3      1225.0
4      1225.0
        ...  
886     729.0
887     361.0
888       NaN
889     676.0
890    1024.0
Name: Age, Length: 891, dtype: float64

In [110]:
# Df['Age'] is unaffected
df['Age']

0      22.0
1      38.0
2      26.0
3      35.0
4      35.0
       ... 
886    27.0
887    19.0
888     NaN
889    26.0
890    32.0
Name: Age, Length: 891, dtype: float64

# Statistics in Pandas Lab

Import the 'lego_sets.csv' dataset and display the first five rows of the DataFrame to get a feel for what we'll be working with

In [111]:
df = pd.read_csv(r'C:\Users\nrmmw\Documents\Flatiron\dsc-statistical-methods-in-pandas-lab\lego_sets.csv')
df

Unnamed: 0,ages,list_price,num_reviews,piece_count,play_star_rating,prod_desc,prod_id,prod_long_desc,review_difficulty,set_name,star_rating,theme_name,val_star_rating,country
0,6-12,29.9900,2.0,277.0,4.0,Catapult into action and take back the eggs fr...,75823.0,Use the staircase catapult to launch Red into ...,Average,Bird Island Egg Heist,4.5,Angry Birds™,4.0,US
1,6-12,19.9900,2.0,168.0,4.0,Launch a flying attack and rescue the eggs fro...,75822.0,Pilot Pig has taken off from Bird Island with ...,Easy,Piggy Plane Attack,5.0,Angry Birds™,4.0,US
2,6-12,12.9900,11.0,74.0,4.3,Chase the piggy with lightning-fast Chuck and ...,75821.0,Pitch speedy bird Chuck against the Piggy Car....,Easy,Piggy Car Escape,4.3,Angry Birds™,4.1,US
3,12+,99.9900,23.0,1032.0,3.6,Explore the architecture of the United States ...,21030.0,Discover the architectural secrets of the icon...,Average,United States Capitol Building,4.6,Architecture,4.3,US
4,12+,79.9900,14.0,744.0,3.2,Recreate the Solomon R. Guggenheim Museum® wit...,21035.0,Discover the architectural secrets of Frank Ll...,Challenging,Solomon R. Guggenheim Museum®,4.6,Architecture,4.1,US
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12256,7-14,36.5878,6.0,341.0,4.4,Protect NINJAGO® City from flying Manta Ray Bo...,70609.0,Help Cole save Shen-Li in this cool THE LEGO® ...,Easy,Manta Ray Bomber,4.3,THE LEGO® NINJAGO® MOVIE™,4.2,PT
12257,7-14,24.3878,8.0,217.0,4.1,Stop a Piranha Attack with Kai and Misako!,70629.0,Play out an action-packed Piranha Mech pursuit...,Easy,Piranha Attack,3.6,THE LEGO® NINJAGO® MOVIE™,4.1,PT
12258,7-14,24.3878,18.0,233.0,4.6,Stop a crime in the NINJAGO® City street market!,70607.0,"Team up with Lloyd Garmadon, Nya and Officer T...",Easy,NINJAGO® City Chase,4.6,THE LEGO® NINJAGO® MOVIE™,4.5,PT
12259,6-14,12.1878,1.0,48.0,5.0,Achieve Spinjitzu greatness with the Green Ninja!,70628.0,Learn all the skills of Spinjitzu with THE LEG...,Very Easy,Lloyd - Spinjitzu Master,5.0,THE LEGO® NINJAGO® MOVIE™,5.0,PT


In [112]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12261 entries, 0 to 12260
Data columns (total 14 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   ages               12261 non-null  object 
 1   list_price         12261 non-null  float64
 2   num_reviews        10641 non-null  float64
 3   piece_count        12261 non-null  float64
 4   play_star_rating   10486 non-null  float64
 5   prod_desc          11884 non-null  object 
 6   prod_id            12261 non-null  float64
 7   prod_long_desc     12261 non-null  object 
 8   review_difficulty  10206 non-null  object 
 9   set_name           12261 non-null  object 
 10  star_rating        10641 non-null  float64
 11  theme_name         12258 non-null  object 
 12  val_star_rating    10466 non-null  float64
 13  country            12261 non-null  object 
dtypes: float64(7), object(7)
memory usage: 1.3+ MB


In [113]:
# How many total rows are in this DataFrame?
print(f"The df has {df.shape[0]} rows")

The df has 12261 rows


In [114]:
# Numerical Columns
# How many columns contain numeric data?
df_num = df.select_dtypes(include= "number").head()
df_num.columns

Index(['list_price', 'num_reviews', 'piece_count', 'play_star_rating',
       'prod_id', 'star_rating', 'val_star_rating'],
      dtype='object')

In [115]:
# Categorical Data
# How many contain categorical data?
df_obj = df.select_dtypes(include = "O")
df_obj.columns

Index(['ages', 'prod_desc', 'prod_long_desc', 'review_difficulty', 'set_name',
       'theme_name', 'country'],
      dtype='object')

In [116]:
# Identify at least 3 columns that contain missing values
# Shika zote
df.isna().sum()

ages                    0
list_price              0
num_reviews          1620
piece_count             0
play_star_rating     1775
prod_desc             377
prod_id                 0
prod_long_desc          0
review_difficulty    2055
set_name                0
star_rating          1620
theme_name              3
val_star_rating      1795
country                 0
dtype: int64

In [117]:
df.describe()

Unnamed: 0,list_price,num_reviews,piece_count,play_star_rating,prod_id,star_rating,val_star_rating
count,12261.0,10641.0,12261.0,10486.0,12261.0,10641.0,10466.0
mean,65.141998,16.826238,493.405921,4.337641,59836.75,4.514134,4.22896
std,91.980429,36.368984,825.36458,0.652051,163811.5,0.518865,0.660282
min,2.2724,1.0,1.0,1.0,630.0,1.8,1.0
25%,19.99,2.0,97.0,4.0,21034.0,4.3,4.0
50%,36.5878,6.0,216.0,4.5,42069.0,4.7,4.3
75%,70.1922,13.0,544.0,4.8,70922.0,5.0,4.7
max,1104.87,367.0,7541.0,5.0,2000431.0,5.0,5.0


In [118]:
# How much is the standard deviation for piece count?
df['piece_count'].std()

825.364580411521

In [119]:
# How many pieces are in the largest lego set?
df['piece_count'].max()

7541.0

In [120]:
# How many in the smallest lego set
df['piece_count'].min()

1.0

In [121]:
# What is the median val_star_rating?
df['val_star_rating'].median()

4.3

In [122]:
# Calculate the median of the star_rating column
df['star_rating'].median()

4.7

In [123]:
# Print the number of unique values in play_star_rating
print(df['play_star_rating'].unique())

[4.  4.3 3.6 3.2 3.7 4.4 4.1 4.2 3.8 4.7 3.  5.  2.  nan 4.6 2.7 4.5 1.
 3.5 3.3 3.9 4.8 2.9 3.4 4.9 2.5 2.2 2.3 2.8 3.1 2.1]


In [124]:
# Calculate the std of the list_price column
df['list_price'].std()

91.9804293059252

If we bought every single lego set in this dataset, how many pieces would we have?

In [170]:
# Total number of pieces across all unique Lego sets
# subset selects the column to use to drop duplicates
df_legoset = df[['set_name', 'piece_count']].drop_duplicates(subset = 'set_name')

In [166]:
df_legoset

Unnamed: 0,set_name,piece_count
0,Bird Island Egg Heist,277.0
1,Piggy Plane Attack,168.0
2,Piggy Car Escape,74.0
3,United States Capitol Building,1032.0
4,Solomon R. Guggenheim Museum®,744.0
...,...,...
1559,Wicked Witch™ Fun Pack,38.0
1565,Vaiana's Ocean Voyage,307.0
3167,Slave I™,1996.0
7428,Moana's Ocean Voyage,307.0


In [169]:
df_legoset['piece_count'].sum()

319067.0

Now, let's try getting the value for the 90% quantile for all numerical columns.

In [171]:
df.quantile(0.9)

list_price            136.2971
num_reviews            38.0000
piece_count          1077.0000
play_star_rating        5.0000
prod_id             75531.0000
star_rating             5.0000
val_star_rating         5.0000
Name: 0.9, dtype: float64

Print the unique values contained within the review_difficulty column.

In [174]:
df['review_difficulty'].unique()

array(['Average', 'Easy', 'Challenging', 'Very Easy', nan,
       'Very Challenging'], dtype=object)

Now, let's get the value_counts() for this column, to see how common each is.

In [175]:
df['review_difficulty'].value_counts()

Easy                4236
Average             3765
Very Easy           1139
Challenging         1058
Very Challenging       8
Name: review_difficulty, dtype: int64

Use applymap() to return a version of the DataFrame where every value has been converted to a string.

In [177]:
string_df = df.applymap(lambda x: str(x))
string_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12261 entries, 0 to 12260
Data columns (total 14 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   ages               12261 non-null  object
 1   list_price         12261 non-null  object
 2   num_reviews        12261 non-null  object
 3   piece_count        12261 non-null  object
 4   play_star_rating   12261 non-null  object
 5   prod_desc          12261 non-null  object
 6   prod_id            12261 non-null  object
 7   prod_long_desc     12261 non-null  object
 8   review_difficulty  12261 non-null  object
 9   set_name           12261 non-null  object
 10  star_rating        12261 non-null  object
 11  theme_name         12261 non-null  object
 12  val_star_rating    12261 non-null  object
 13  country            12261 non-null  object
dtypes: object(14)
memory usage: 1.3+ MB
