In [199]:
import numpy as np

def _print_array_summary(label, arr):
    """Print size, ndim, shape and dtype of ``arr`` on one line, prefixed by ``label``."""
    print(f"{label}: size:{arr.size}",
          f"ndim:{arr.ndim}",
          f"shape:{arr.shape}",
          f"data type:{arr.dtype}")


# Twelve consecutive integers, 5 through 16.
feature_vector = np.arange(5, 17)

# Seed the legacy global RNG so the random matrix is the same on every run.
np.random.seed(42)

# 3x4 matrix of random integers drawn from 0..9 (upper bound 10 is exclusive).
feature_matrix = np.random.randint(low=0, high=10, size=(3, 4))

_print_array_summary("Feature Vector", feature_vector)
_print_array_summary("Feature Matrix", feature_matrix)

Feature Vector: size:12 ndim:1 shape:(12,) data type:int32
Feature Matrix: size:12 ndim:2 shape:(3, 4) data type:int32


In [200]:
# Pull out each selection first, then report it.
# The printed labels use 1-based row/column numbers; the indices are 0-based.
row2_col3_value = feature_matrix[1][2]
first_row = feature_matrix[0, :]
last_column = feature_matrix[..., -1]
top_right_block = feature_matrix[0:2, -2:]   # first two rows, last two columns

print(f"feature_matrix:\n{feature_matrix}")
print(f"value in row 2 and column 3 of feature_matrix:{row2_col3_value}")
print(f"row 1 of feature_matrix:{first_row}")
print(f"last column of feature_matrix:{last_column}")
print(f"top right 2x2 submatrix of feature_matrix:\n{top_right_block}")


feature_matrix:
[[6 3 7 4]
 [6 9 2 6]
 [7 4 3 7]]
value in row 2 and column 3 of feature_matrix:2
row 1 of feature_matrix:[6 3 7 4]
last column of feature_matrix:[4 6 7]
top right 2x2 submatrix of feature_matrix:
[[7 4]
 [2 6]]


In [201]:
# Rearrange the 12 elements into 4 rows of 3 and rebind the same name.
feature_vector = np.reshape(feature_vector, (4, 3))
print(f"feature_vector shape:{feature_vector.shape}")

# flatten() returns a 1-D copy of the matrix.
flat_features = feature_matrix.flatten()
print(f"feature_matrix flattened:\n{flat_features}")

feature_vector shape:(4, 3)
feature_matrix flattened:
[6 3 7 4 6 9 2 6 7 4 3 7]


In [202]:
# np.ones defaults to float, so stacking it with the integer matrix gives float results.
new_matrix = np.ones(shape=(3, 4))

# Stack along rows (axis 0): the ones are appended below feature_matrix.
vstacked_matrix = np.concatenate((feature_matrix, new_matrix), axis=0)
print(f"vstacked_matrix:\n{vstacked_matrix}")

# Stack along columns (axis 1): the ones are appended to the right.
hstacked_matrix = np.concatenate((feature_matrix, new_matrix), axis=1)
print(f"hstacked_matrix:\n{hstacked_matrix}")

# A 3x1 integer column, appended as one extra column.
new_vector = np.array([[10], [20], [30]])
stacked_matrix = np.concatenate((feature_matrix, new_vector), axis=1)
print(f"stacked_matrix:\n{stacked_matrix}")

vstacked_matrix:
[[6. 3. 7. 4.]
 [6. 9. 2. 6.]
 [7. 4. 3. 7.]
 [1. 1. 1. 1.]
 [1. 1. 1. 1.]
 [1. 1. 1. 1.]]
hstacked_matrix:
[[6. 3. 7. 4. 1. 1. 1. 1.]
 [6. 9. 2. 6. 1. 1. 1. 1.]
 [7. 4. 3. 7. 1. 1. 1. 1.]]
stacked_matrix:
[[ 6  3  7  4 10]
 [ 6  9  2  6 20]
 [ 7  4  3  7 30]]


In [203]:
np.random.seed(101)

# 15 rows (students) x 3 columns (exams); integers from 50 to 100 inclusive.
scores = np.random.randint(low=50, high=101, size=(15, 3))

# Column-wise statistics: one value per exam.
exam_means = scores.mean(axis=0)
print(f"exam means:\n{exam_means}")
exam_stds = scores.std(axis=0)
print(f"exam stds:\n{exam_stds}")

third_exam_max = scores[:, 2].max()
print(f"max scores in exam 3:\n{third_exam_max}")

# Row-wise statistic: one value per student.
student_means = scores.mean(axis=1)
print(f"students means:\n{student_means}")

# Broadcasting adds the 3-element bonus to every row, so only column 0 changes.
first_exam_bonus = np.array([5, 0, 0])
curved_scores = scores + first_exam_bonus
print(f"first exam scores plus 5:\n{curved_scores}")

# Standardise each exam column to zero mean and unit standard deviation.
normized_scores = (scores - exam_means) / exam_stds
normalized_means = normized_scores.mean(axis=0)
normalized_stds = normized_scores.std(axis=0)
print(f"normalized scores:\n exam means:{normalized_means}",
      f"exam stds:{normalized_stds}")


exam means:
[76.73333333 70.46666667 73.93333333]
exam stds:
[15.91840305 16.38644426 12.01369589]
max scores in exam 3:
96
students means:
[69.66666667 63.33333333 73.         78.         74.66666667 65.33333333
 86.         78.         78.66666667 63.66666667 82.33333333 66.66666667
 80.33333333 77.33333333 68.66666667]
first exam scores plus 5:
[[ 86  61  67]
 [ 61  73  61]
 [102  59  63]
 [ 95  54  90]
 [ 83  50  96]
 [ 60  62  79]
 [ 95  99  69]
 [102  58  79]
 [ 89  94  58]
 [ 74  60  62]
 [ 86  93  73]
 [ 55  91  59]
 [102  58  86]
 [ 74  85  78]
 [ 62  60  89]]
normalized scores:
 exam means:[-7.40148683e-17 -9.62193288e-17 -2.66453526e-16] exam stds:[1. 1. 1.]


In [204]:
print(f"scores:\n{scores}")

# Element-wise mask over every individual score.
greater_than_95 = scores > 95
print(f"number of scores greater than 95:\n{np.sum(greater_than_95)}")

# Boolean indexing with a 2-D mask returns the matching values as a flat array.
high_performance_scores = scores[greater_than_95]
print(f"high performance scores:\n{high_performance_scores}")

# Rows where column 0 (exam 1) exceeds 70, then column 1 (exam 2) of those rows.
print(f"exam 2 mean for exam 1 scores greater than 70:\n{np.mean(scores[scores[:, 0] > 70, 1])}")

# One boolean per student: True when the student's mean across exams is above 65.
pass_fail = np.mean(scores, axis=1) > 65
print(f"pass/fail:\n{pass_fail}")

# Rows in which every exam score is above 90.  `execlent_students` stays an
# array even when it is empty; the fallback message lives in a separate
# display-only variable so the name never switches from ndarray to str.
execlent_students = scores[np.all(scores > 90, axis=1)]
execlent_students_report = (
    execlent_students
    if execlent_students.size
    else "No students with all scores above 90"
)

print(f"execlent students:\n{execlent_students_report}")

scores:
[[81 61 67]
 [56 73 61]
 [97 59 63]
 [90 54 90]
 [78 50 96]
 [55 62 79]
 [90 99 69]
 [97 58 79]
 [84 94 58]
 [69 60 62]
 [81 93 73]
 [50 91 59]
 [97 58 86]
 [69 85 78]
 [57 60 89]]
number of scores greater than 95:
5
high performance scores:
[97 96 99 97 97]
exam 2 mean for exam 1 scores greater than 70:
69.55555555555556
pass/fail:
[ True False  True  True  True  True  True  True  True False  True  True
  True  True  True]
execlent students:
No students with all scores above 90


In [205]:
import pandas as pd

# Read from the raw.githubusercontent.com address: the regular GitHub page is
# HTML, which makes the CSV parser raise ParserError.
TITANIC_CSV_URL = "https://raw.githubusercontent.com/datasciencedojo/datasets/master/titanic.csv"
df = pd.read_csv(TITANIC_CSV_URL)
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [206]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
df.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.125,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


In [207]:
# Grab the column once, report its dtype, then display the Series itself.
survived_column = df["Survived"]
print(survived_column.dtype)
survived_column

int64


0      0
1      1
2      1
3      1
4      0
      ..
886    0
887    1
888    0
889    1
890    0
Name: Survived, Length: 891, dtype: int64

In [208]:
# All rows, three columns chosen by label; a list of labels yields a DataFrame.
df.loc[:, ["Sex", "Pclass", "Age"]]

Unnamed: 0,Sex,Pclass,Age
0,male,3,22.0
1,female,1,38.0
2,female,3,26.0
3,female,1,35.0
4,male,3,35.0
...,...,...,...
886,male,2,27.0
887,female,1,19.0
888,female,3,
889,male,1,26.0


In [209]:
# Label-based lookup: the row whose index label is 3, returned as a Series.
df.loc[3]

PassengerId                                               4
Survived                                                  1
Pclass                                                    1
Name           Futrelle, Mrs. Jacques Heath (Lily May Peel)
Sex                                                  female
Age                                                    35.0
SibSp                                                     1
Parch                                                     0
Ticket                                               113803
Fare                                                   53.1
Cabin                                                  C123
Embarked                                                  S
Name: 3, dtype: object

In [210]:
# Position-based slice: the first five rows and the first three columns.
df.iloc[:5, :3]

Unnamed: 0,PassengerId,Survived,Pclass
0,1,0,3
1,2,1,1
2,3,1,3
3,4,1,1
4,5,0,3


In [211]:
# Survived only takes the values 0 and 1, so its sum is the number of survivors.
survivor_count = df["Survived"].sum()
print(f"survived passengers number: {survivor_count}")

# Keep only the rows flagged as survived.
survivors = df[df["Survived"].eq(1)]
survivors.head()

survived passengers number: 342


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
8,9,1,3,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",female,27.0,0,2,347742,11.1333,,S
9,10,1,2,"Nasser, Mrs. Nicholas (Adele Achem)",female,14.0,1,0,237736,30.0708,,C


In [212]:
# Summing a boolean Series counts its True entries.
male_survivor_count = survivors["Sex"].eq("male").sum()
print(f"male survivors number: {male_survivor_count}")

male survivors number: 109


In [213]:
# Build each condition as its own boolean mask, then combine them element-wise.
# Rows with a missing Age compare as False and are therefore excluded.
is_first_class = df["Pclass"].eq(1)
is_over_50 = df["Age"].gt(50)
first_class_over_50 = df.loc[is_first_class & is_over_50]
first_class_over_50.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
6,7,0,1,"McCarthy, Mr. Timothy J",male,54.0,0,0,17463,51.8625,E46,S
11,12,1,1,"Bonnell, Miss. Elizabeth",female,58.0,0,0,113783,26.55,C103,S
54,55,0,1,"Ostby, Mr. Engelhart Cornelius",male,65.0,0,1,113509,61.9792,B30,C
96,97,0,1,"Goldschmidt, Mr. George B",male,71.0,0,0,PC 17754,34.6542,A5,C
124,125,0,1,"White, Mr. Percival Wayland",male,54.0,0,1,35281,77.2875,D26,S


In [214]:
# Number of missing values in each column (isna is the same check as isnull).
df.isna().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [215]:
# Work on a new frame without the Cabin column; `df` itself is left untouched.
df_cleaned = df.drop(columns=["Cabin"])
df_cleaned.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,S


In [216]:
# Fill missing ages with the median age computed from the raw frame.
# The result is rebound to the name instead of using inplace=True, so the
# cell does not silently mutate a frame that an earlier cell already displayed.
age_median = df["Age"].median()
df_cleaned = df_cleaned.fillna({"Age": age_median})
df_cleaned.isnull().sum()

PassengerId    0
Survived       0
Pclass         0
Name           0
Sex            0
Age            0
SibSp          0
Parch          0
Ticket         0
Fare           0
Embarked       2
dtype: int64

In [217]:
# Fill missing Embarked values with the most frequent value of the raw column.
# mode() returns a Series of the most frequent value(s); [0] takes the first.
# The result is rebound to the name instead of using inplace=True, so the
# cell does not silently mutate a frame that an earlier cell already displayed.
embarked_mode = df["Embarked"].mode()[0]
df_cleaned = df_cleaned.fillna({"Embarked": embarked_mode})
df_cleaned.isnull().sum()

PassengerId    0
Survived       0
Pclass         0
Name           0
Sex            0
Age            0
SibSp          0
Parch          0
Ticket         0
Fare           0
Embarked       0
dtype: int64

In [220]:
# Columns whose dtype should be changed, mapped to the dtype they should get.
dtype_overrides = {"Pclass": "category"}
df_cleaned = df_cleaned.astype(dtype_overrides)
df_cleaned.dtypes

PassengerId       int64
Survived          int64
Pclass         category
Name             object
Sex              object
Age             float64
SibSp             int64
Parch             int64
Ticket           object
Fare            float64
Embarked         object
dtype: object

In [221]:
# Derived column: SibSp + Parch.
# NOTE(review): the sum does not add 1 for the passenger — confirm that is intended.
family_size = df_cleaned["SibSp"] + df_cleaned["Parch"]
df_cleaned = df_cleaned.assign(FamilySize=family_size)
df_cleaned.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Embarked,FamilySize
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,S,1
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C,1
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,S,0
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,S,1
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,S,0


In [224]:
# Capture the run of letters that follows a space and is terminated by a period.
title_pattern = r' ([A-Za-z]+)\.'
titles = df_cleaned["Name"].str.extract(title_pattern, expand=False)
df_cleaned = df_cleaned.assign(Title=titles)
df_cleaned["Title"].unique()

array(['Mr', 'Mrs', 'Miss', 'Master', 'Don', 'Rev', 'Dr', 'Mme', 'Ms',
       'Major', 'Lady', 'Sir', 'Mlle', 'Col', 'Capt', 'Countess',
       'Jonkheer'], dtype=object)

In [225]:
# Mean of the 0/1 Survived column within each Sex group, i.e. the share that survived.
df_cleaned.groupby("Sex")["Survived"].mean()

Sex
female    0.742038
male      0.188908
Name: Survived, dtype: float64

In [228]:
# Pclass is a categorical column. Grouping by a categorical without an explicit
# `observed` argument makes pandas emit a FutureWarning, so the current
# default (observed=False) is passed explicitly to keep the result unchanged.
df_cleaned.groupby("Pclass", observed=False)["Survived"].mean()

  df_cleaned.groupby("Pclass")["Survived"].mean()


Pclass
1    0.629630
2    0.472826
3    0.242363
Name: Survived, dtype: float64

In [229]:
# Share that survived for every (Sex, Pclass) pair.  Pclass is categorical, so
# `observed` is passed explicitly: observed=False keeps the current result and
# avoids the FutureWarning pandas emits when the argument is omitted.
df_cleaned.groupby(["Sex", "Pclass"], observed=False)["Survived"].mean()

  df_cleaned.groupby(["Sex","Pclass"])["Survived"].mean()


Sex     Pclass
female  1         0.968085
        2         0.921053
        3         0.500000
male    1         0.368852
        2         0.157407
        3         0.135447
Name: Survived, dtype: float64

In [230]:
def age_group(age, *, child_below=18, senior_from=65):
    """Classify an age as "Child", "Adult" or "Senior".

    Parameters
    ----------
    age : number or missing value
        The age to classify.
    child_below : number, default 18
        Ages strictly below this value are labelled "Child".
    senior_from : number, default 65
        Ages at or above this value are labelled "Senior".

    Returns
    -------
    str or None
        "Child", "Adult" or "Senior"; None when ``age`` is missing
        (None, NaN or pd.NA).
    """
    # A NaN fails both `<` comparisons below, so without this guard a missing
    # age would fall through to the final branch and be labelled "Senior".
    if pd.isna(age):
        return None
    if age < child_below:
        return "Child"
    if age < senior_from:
        return "Adult"
    return "Senior"
    
# Label every passenger's age with age_group, then attach the labels as a new column.
age_labels = df_cleaned["Age"].apply(age_group)
df_cleaned = df_cleaned.assign(AgeGroup=age_labels)
df_cleaned.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Embarked,FamilySize,Title,AgeGroup
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,S,1,Mr,Adult
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C,1,Mrs,Adult
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,S,0,Miss,Adult
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,S,1,Mrs,Adult
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,S,0,Mr,Adult
