TASK: 
Generated a Synthetic Dataset with columns of age, salary, department, years_experience, and is_manager

In [1]:
import numpy as np
import pandas as pd

In [2]:
np.random.seed(42) # For reproducibility
n_samples = 100 # Number of samples

data = {
    'age': np.random.randint(18, 60, size=n_samples),
    'salary': np.random.randint(30000, 120000, size=n_samples),
    'department': np.random.choice(['IT', 'HR', 'Finance', 'Marketing'], size=n_samples),
    'years_experience': np.round(np.random.normal(5, 2, size=n_samples), 1),
    'is_manager': np.random.choice([0, 1], size=n_samples)
}
df = pd.DataFrame(data)

Q1: View Data Structure

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 5 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   age               100 non-null    int32  
 1   salary            100 non-null    int32  
 2   department        100 non-null    object 
 3   years_experience  100 non-null    float64
 4   is_manager        100 non-null    int32  
dtypes: float64(1), int32(3), object(1)
memory usage: 2.9+ KB


Q2: Get DataFrame Info and Summary stats

In [4]:
df.describe()

Unnamed: 0,age,salary,years_experience,is_manager
count,100.0,100.0,100.0,100.0
mean,37.91,77809.16,4.823,0.47
std,12.219454,26058.643576,2.237822,0.501614
min,18.0,30206.0,-0.8,0.0
25%,26.75,55141.0,3.475,0.0
50%,38.0,80932.0,4.7,0.0
75%,46.25,98107.25,6.0,1.0
max,59.0,119474.0,11.4,1.0


Q3: Do Simple numpy operations

In [8]:
#creating array using numpy
arr1 = np.array([2,4,6,8,10])
print(arr1)
arr2 = np.array(([1,2,3,4,5],[2,4,6,8,10]))
print(arr2)
#Array shape
print(arr2.shape)
#zero array
zero_array = np.zeros((3,3))
print(zero_array)
#identity array
identity_array = np.identity((2))
print(identity_array)
arr = np.random.randint(1,15,10)
print(arr)
#Indexing
print(arr2[1])
#Slicing
print(arr1[1:4])
#Filtering
print(arr[arr>8])

[ 2  4  6  8 10]
[[ 1  2  3  4  5]
 [ 2  4  6  8 10]]
(2, 5)
[[0. 0. 0.]
 [0. 0. 0.]
 [0. 0. 0.]]
[[1. 0.]
 [0. 1.]]
[ 4  9  6 13 11  3  1  4 12 13]
[ 2  4  6  8 10]
[4 6 8]
[ 9 13 11 12 13]


Q4: Filtering and Indexing rows

In [15]:
#loc
print(df.loc[(df.is_manager==1) & (df.department=='HR')])

    age  salary department  years_experience  is_manager
2    32  108603         HR               5.0           1
3    25   82256         HR               4.2           1
4    38  119135         HR               4.1           1
13   41   38110         HR               2.0           1
45   45   78925         HR               4.1           1
51   53   45707         HR               4.0           1
85   18   68623         HR               4.1           1
88   28   76717         HR               3.6           1
96   56   42688         HR               4.4           1
99   24   97863         HR               5.2           1


In [16]:
#iloc
print(df.iloc[20:26, :])

    age  salary department  years_experience  is_manager
20   19  117054  Marketing               6.7           0
21   38  117897    Finance               5.8           0
22   50   53419         HR               5.0           0
23   29   80636    Finance               1.9           0
24   39   80015    Finance               8.0           1
25   42   84268    Finance               5.8           0


Q5: Adding a column

In [18]:
df['salary_per_year'] = df['salary'] / (df['years_experience'] + 1)
df['salary_per_year']

0     191960.000000
1      13757.954545
2      18100.500000
3      15818.461538
4      23359.803922
          ...      
95     16532.400000
96      7905.185185
97      7906.000000
98      5415.887097
99     15784.354839
Name: salary_per_year, Length: 100, dtype: float64

Q6: Grouping and Aggregation

In [22]:
df.groupby('department')['age'].value_counts()

department  age
Finance     19     2
            38     2
            42     2
            57     2
            21     1
                  ..
Marketing   53     1
            54     1
            56     1
            57     1
            59     1
Name: count, Length: 82, dtype: int64

In [37]:
df.groupby(['department',df['salary']>50000]).sum()

Unnamed: 0_level_0,Unnamed: 1_level_0,age,salary,years_experience,is_manager,salary_per_year
department,salary,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Finance,False,155,121798,7.7,1,37881.916667
Finance,True,791,1873195,98.3,9,376451.127402
HR,False,236,198672,19.0,3,43618.65606
HR,True,470,1198266,77.7,7,199656.45163
IT,False,196,151663,12.5,3,222226.984127
IT,True,577,1440672,79.7,10,308061.792971
Marketing,False,203,187045,32.5,1,26665.270441
Marketing,True,1163,2609605,154.9,13,496838.088688
