## Data Engineering Pandas (Data Frames)

In [47]:
import pandas as pd
import numpy as np

##### 1-Create a CSV file with the following data:

In [48]:
data = {
    'Name': ['Alice', 'Bob', 'Charlie', 'Danielle'],
    'Age': [25, 30, 35, 40],
    'Gender': ['Female', 'Male', 'Male', 'Female'],
    'Occupation': ['Engineer', 'Scientist', 'Artist', 'Teacher']
}
df = pd.DataFrame(data)

df.to_csv('data.csv', index=False)


#### 2. Read the CSV file into a DataFrame using pandas.

In [49]:
df = pd.read_csv('data.csv')
print(df)


       Name  Age  Gender Occupation
0     Alice   25  Female   Engineer
1       Bob   30    Male  Scientist
2   Charlie   35    Male     Artist
3  Danielle   40  Female    Teacher


#### 3. Extract the 'Age' and 'Occupation' columns into a new DataFrame.

In [50]:
df = pd.read_csv('data.csv')
new_df = df[['Age', 'Occupation']]
print(new_df)


   Age Occupation
0   25   Engineer
1   30  Scientist
2   35     Artist
3   40    Teacher


#### 4. Try to extract a column that doesn't exist in the DataFrame and observe the error message.

In [51]:
column_name = 'Salary'
try:
    column = df[column_name]
except KeyError as e:
    print(f"Error: {e}")


Error: 'Salary'


#### 5. Assign the value 50 to the entire 'Age' column.

In [52]:
df['Age'] = 50
df

Unnamed: 0,Name,Age,Gender,Occupation
0,Alice,50,Female,Engineer
1,Bob,50,Male,Scientist
2,Charlie,50,Male,Artist
3,Danielle,50,Female,Teacher


#### 6. Try to assign random values for the Salary

In [53]:
df = pd.read_csv('data.csv')
df['Salary'] = np.random.randint(30000, 80000, size=len(df))
df

Unnamed: 0,Name,Age,Gender,Occupation,Salary
0,Alice,25,Female,Engineer,69188
1,Bob,30,Male,Scientist,47568
2,Charlie,35,Male,Artist,49769
3,Danielle,40,Female,Teacher,58693


#### 7. Add a new column called 'Kids' to the DataFrame with the following values: [2, 4, 0, 5]

In [54]:
kids_values = [2, 4, 0, 5]
df['Kids'] = kids_values
print(df)

       Name  Age  Gender Occupation  Salary  Kids
0     Alice   25  Female   Engineer   69188     2
1       Bob   30    Male  Scientist   47568     4
2   Charlie   35    Male     Artist   49769     0
3  Danielle   40  Female    Teacher   58693     5


#### 8. Delete the 'Gender' column from the DataFrame.

In [55]:
df = df.drop('Gender', axis=1)
df

Unnamed: 0,Name,Age,Occupation,Salary,Kids
0,Alice,25,Engineer,69188,2
1,Bob,30,Scientist,47568,4
2,Charlie,35,Artist,49769,0
3,Danielle,40,Teacher,58693,5


#### 9. Save the modified DataFrame to a new CSV file

In [56]:
df.to_csv('modified_data.csv', index=False)

## Data Engineering (Reindexing)

#### Exercise for Series reindexing:

#### 1. Create a Series with the following data: data = [10, 20, 30, 40, 50]

In [57]:
data = [10, 20, 30, 40, 50]
series = pd.Series(data)
series

0    10
1    20
2    30
3    40
4    50
dtype: int64

##### 2. Assign the following index labels to the Series: index = ['a', 'b', 'c', 'd', 'e']

In [58]:
data = [10, 20, 30, 40, 50]
index = ['a', 'b', 'c', 'd', 'e']
series = pd.Series(data, index=index)
print(series)


a    10
b    20
c    30
d    40
e    50
dtype: int64


#### 3. Get the index of the Series using the index attribute

In [59]:
series_index = series.index
series_index

Index(['a', 'b', 'c', 'd', 'e'], dtype='object')

#### 4. Create a new index for the Series using the following labels: new_index = ['a', 'b', 'c', 'd', 'e', 'f', 
'g']

In [60]:
data = [10, 20, 30, 40, 50]
index = ['a', 'b', 'c', 'd', 'e']
new_index = ['a', 'b', 'c', 'd', 'e', 'f', 'g']
new_index

['a', 'b', 'c', 'd', 'e', 'f', 'g']

#### 5. Call the reindex method on the Series with the new index and observe the output.

In [61]:
series = pd.Series(data, index=index)
new_series = series.reindex(new_index)
new_series

a    10.0
b    20.0
c    30.0
d    40.0
e    50.0
f     NaN
g     NaN
dtype: float64

#### 6. Fill in values for the new indexes using a value of your choice

In [68]:
fill_value = 0
series = pd.Series(data, index=index)
new_series = series.reindex(new_index, fill_value=fill_value)
new_series

a    NaN
b    NaN
c    NaN
d    NaN
e    NaN
f      0
g      0
dtype: object

#### 7. Use the ffill method to fill the new indexes with the previous values in the Series

In [63]:
series = pd.Series(data, index=index)
new_series = series.reindex(new_index).ffill()
new_series

a    10.0
b    20.0
c    30.0
d    40.0
e    50.0
f    50.0
g    50.0
dtype: float64

### Exercise for DataFrame reindexing

### 1. Create a DataFrame with the following random values:
##### import pandas as pd
##### import numpy as np
##### np.random.seed(123)
##### data = {'A': np.random.randint(1, 10, 5), 'B': np.random.randint(1, 10, 5)}
##### df = pd.DataFrame(data)

In [64]:
import pandas as pd
import numpy as np

In [70]:
np.random.seed(123)
data = {'A': np.random.randint(1, 10, 5), 'B': np.random.randint(1, 10, 5)}
df = pd.DataFrame(data)
df


Unnamed: 0,A,B
0,3,7
1,3,2
2,7,1
3,2,2
4,4,1


#### 1. Get the current index and columns of the DataFrame using the index and columns attributes

In [69]:
current_index = df.index
current_columns = df.columns
print("Current Index:", current_index)
print("Current Columns:", current_columns)

Current Index: RangeIndex(start=0, stop=5, step=1)
Current Columns: Index(['A', 'B'], dtype='object')


#### 2. Create a new index for the DataFrame using the following labels: new_index = ['a', 'b', 'c', 'd', 'e']

In [80]:
new_index = ['a', 'b', 'c', 'd', 'e']


##### 3. Call the reindex method on the DataFrame with the new index and observe the output.

In [81]:
df_reindexed = df.reindex(new_index)

In [82]:
df_reindexed

Unnamed: 0,A,B
a,,
b,,
c,,
d,,
e,,


#### 4. Fill in values for the new indexes using a value of your choice.

In [79]:
fill_value = 3
df_filled = df_reindexed.fillna(fill_value)
df_filled

Unnamed: 0,A,B
a,3.0,3.0
b,3.0,3.0
c,3.0,3.0
d,3.0,3.0
e,3.0,3.0


##### 5. Use the ffill method to fill the new indexes with the previous values in the DataFrame's columns

Unnamed: 0,A,B
a,,
b,,
c,,
d,,
e,,


##### Data Engineering (Selecting Entries)

##### Exercise for Series:

##### 1. Create a Series with the following data: data = [5, 10, 15, 20, 25, 30], index = ['a', 'b', 'c', 'd', 'e', 
'f']

In [87]:
data = [5, 10, 15, 20, 25, 30]
index = ['a', 'b', 'c', 'd', 'e', 'f']
series = pd.Series(data, index=index)
series


a     5
b    10
c    15
d    20
e    25
f    30
dtype: int64

##### 2. Grab the entry with index name 'c' from the Series.

In [88]:
entry_c = series.loc['c']
print(entry_c)


15


##### 3. Grab entries in the Series with index values ['a', 'c', 'f']

In [89]:
entries = series.loc[['a', 'c', 'f']]
entries

a     5
c    15
f    30
dtype: int64

###### 4. Grab entries in the Series based on index logic: entries greater than 15.

In [90]:
entries_greater_than_15 = series[series > 15]
entries_greater_than_15

d    20
e    25
f    30
dtype: int64

###### 5. replace the value 10 by 100

In [91]:
series_replaced = series.replace(10, 100)
print(series_replaced)

a      5
b    100
c     15
d     20
e     25
f     30
dtype: int64


###### Exercise for DataFrames:

###### 1. Create a DataFrame with the following data: data = {'A': [1, 2, 3, 4, 5], 'B': [10, 20, 30, 40, 50], 'C': 
[100, 200, 300, 400, 500]}, index = ['a', 'b', 'c', 'd', 'e']

In [92]:
data = {'A': [1, 2, 3, 4, 5], 'B': [10, 20, 30, 40, 50], 'C': [100, 200, 300, 400, 500]}
index = ['a', 'b', 'c', 'd', 'e']
df = pd.DataFrame(data, index=index)
print(df)


   A   B    C
a  1  10  100
b  2  20  200
c  3  30  300
d  4  40  400
e  5  50  500


###### 2. Grab the row with index label 'c' from the DataFrame.

In [93]:
row_c = df.loc['c']
print(row_c)


A      3
B     30
C    300
Name: c, dtype: int64


###### 3. Grab rows in the DataFrame with index range 'b' to 'e'

In [94]:
rows_b_2_e = df.loc['b':'e']
print(rows_b_2_e )


   A   B    C
b  2  20  200
c  3  30  300
d  4  40  400
e  5  50  500


###### 4. Grab rows in the DataFrame with index values ['a', 'c', 'e'].

In [95]:
rows = df.loc[['a', 'c', 'e']]
print(rows)

   A   B    C
a  1  10  100
c  3  30  300
e  5  50  500


###### 5. Grab rows in the DataFrame based on index logic: rows where the value in column 'A' is greater than 2

In [96]:
rows_greater_than_2 = df[df['A'] > 2]
print(rows_greater_than_2)

   A   B    C
c  3  30  300
d  4  40  400
e  5  50  500


##### 6. Grab rows in the DataFrame based on index logic: rows where the value in column 'A' is greater than 2 and 'B' less than 50

In [97]:
logic_conditions = df[(df['A'] > 2) & (df['B'] < 50)]
print(logic_conditions)


   A   B    C
c  3  30  300
d  4  40  400


#### 7. Drop column 'A' from the data frame

In [100]:
df_dropped = df.drop('A', axis=1)
df_dropped


Unnamed: 0,B,C
a,10,100
b,20,200
c,30,300
d,40,400
e,50,500


###### 8. drop raw 'e' from the data frame

In [101]:
df_dropped = df.drop('e', axis=0)
print(df_dropped)

   A   B    C
a  1  10  100
b  2  20  200
c  3  30  300
d  4  40  400
