In [1]:
# 7.2 Data Transformation
import pandas as pd
import numpy as np
# Create a DataFrame with random data

data = pd.DataFrame({"Viridescent Venereer": ["flower","feather"]* 3 + ["feather"],
                     "Emblem of Severe Fate": [1, 1,2,3,3,4,4]})
# Add a new column with random values

data 



Unnamed: 0,Viridescent Venereer,Emblem of Severe Fate
0,flower,1
1,feather,1
2,flower,2
3,feather,3
4,flower,3
5,feather,4
6,feather,4


In [2]:
data.duplicated() # Check for duplicates in the DataFrame

0    False
1    False
2    False
3    False
4    False
5    False
6     True
dtype: bool

In [3]:
data.drop_duplicates(inplace=True)  # Remove duplicates
# Display the DataFrame
data 

Unnamed: 0,Viridescent Venereer,Emblem of Severe Fate
0,flower,1
1,feather,1
2,flower,2
3,feather,3
4,flower,3
5,feather,4


In [4]:
data['Thundering Fury'] = range(len(data)) # Add a new column with a range of values matching the number of rows
data  

Unnamed: 0,Viridescent Venereer,Emblem of Severe Fate,Thundering Fury
0,flower,1,0
1,feather,1,1
2,flower,2,2
3,feather,3,3
4,flower,3,4
5,feather,4,5


In [5]:
data.drop_duplicates(subset = ["Viridescent Venereer"])  
# Remove duplicates

Unnamed: 0,Viridescent Venereer,Emblem of Severe Fate,Thundering Fury
0,flower,1,0
1,feather,1,1


In [6]:
data.drop_duplicates(['Viridescent Venereer','Emblem of Severe Fate'],
                     keep= "last")  
# Remove duplicates based on multiple columns,
#  keeping the last occurrence

Unnamed: 0,Viridescent Venereer,Emblem of Severe Fate,Thundering Fury
0,flower,1,0
1,feather,1,1
2,flower,2,2
3,feather,3,3
4,flower,3,4
5,feather,4,5


In [7]:
data1 = pd.DataFrame({"Food": ["Burger","Beef Burger", "Cheese Burger",
                               "Chicken Burger","Fish Burger", "Spagetti",
                               "Pizza", "Pasta","Mpzarella Pizza"],
                      "Price": [4,3,12,6,7.5,8,3,5,6]})
# Display the DataFrame
data1

Unnamed: 0,Food,Price
0,Burger,4.0
1,Beef Burger,3.0
2,Cheese Burger,12.0
3,Chicken Burger,6.0
4,Fish Burger,7.5
5,Spagetti,8.0
6,Pizza,3.0
7,Pasta,5.0
8,Mpzarella Pizza,6.0


In [8]:
# suppose we want to add a new column indicating 
# the type of animal that the food is made of

data1['Food Source'] = np.where(data1['Food'].str.contains("Fish"), "Salmon",
                   np.where(data1['Food'].str.contains("Burger"), "Beef",
                   np.where(data1['Food'].str.contains("Pizza"), "Cow",
                   np.where(data1['Food'].str.contains("Spagetti"), "Wheat",
                   np.where(data1['Food'].str.contains("Pasta"), "Wheat", "Unknown")))))

# Display the DataFrame with the new column
data1

Unnamed: 0,Food,Price,Food Source
0,Burger,4.0,Beef
1,Beef Burger,3.0,Beef
2,Cheese Burger,12.0,Beef
3,Chicken Burger,6.0,Beef
4,Fish Burger,7.5,Salmon
5,Spagetti,8.0,Wheat
6,Pizza,3.0,Cow
7,Pasta,5.0,Wheat
8,Mpzarella Pizza,6.0,Cow


In [9]:
def get_food_source(food):
    return food_source.get(food, "Unknown")
# Create a dictionary to map food items to their sources
food_source = {
    "Burger": "Beef",
    "Beef Burger": "Beef",
    "Cheese Burger": "Cow",
    "Chicken Burger": "Chicken",
    "Fish Burger": "Salmon",
    "Spagetti": "Wheat",
    "Pizza": "Cow",
    "Pasta": "Wheat",
    "Mozzarella Pizza": "Cow"
}
# Apply the function to create a new column
data1['Food Source'] = data1['Food'].apply(get_food_source)
# Display the DataFrame with the new column
data1

Unnamed: 0,Food,Price,Food Source
0,Burger,4.0,Beef
1,Beef Burger,3.0,Beef
2,Cheese Burger,12.0,Cow
3,Chicken Burger,6.0,Chicken
4,Fish Burger,7.5,Salmon
5,Spagetti,8.0,Wheat
6,Pizza,3.0,Cow
7,Pasta,5.0,Wheat
8,Mpzarella Pizza,6.0,Unknown


In [10]:
data1["Food"].map(get_food_source)

0       Beef
1       Beef
2        Cow
3    Chicken
4     Salmon
5      Wheat
6        Cow
7      Wheat
8    Unknown
Name: Food, dtype: object

In [11]:
# Ensure data1 exists before replacing values
if 'data1' in locals():
    data1['Food Source'].replace({"Beef": "Cow", "Chicken": "Poultry"}, 
                                 inplace=True)
    # Display the DataFrame with the replaced values
    display(data1)
else:
    print("data1 is not defined. Please run the previous cells that define data1.")

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data1['Food Source'].replace({"Beef": "Cow", "Chicken": "Poultry"},


Unnamed: 0,Food,Price,Food Source
0,Burger,4.0,Cow
1,Beef Burger,3.0,Cow
2,Cheese Burger,12.0,Cow
3,Chicken Burger,6.0,Poultry
4,Fish Burger,7.5,Salmon
5,Spagetti,8.0,Wheat
6,Pizza,3.0,Cow
7,Pasta,5.0,Wheat
8,Mpzarella Pizza,6.0,Unknown


In [12]:
data2 = pd.Series([1., -999, 2., -999, -1000, 3.])
# Replace -999 and -1000 with NaN
data2.replace([-999, -1000], np.nan, inplace=True)

In [13]:
# Display the Series after replacement
data2
# Replace NaN with the mean of the Series

0    1.0
1    NaN
2    2.0
3    NaN
4    NaN
5    3.0
dtype: float64

In [14]:
data2.fillna(data2.mean(), inplace=True)
# Display the Series after filling NaN values
data2
# Create a DataFrame with random data

0    1.0
1    2.0
2    2.0
3    2.0
4    2.0
5    3.0
dtype: float64

In [15]:
data2 = pd.Series([1., -999, 2., -999, -1000, 3.])
data2.replace({-999: np.nan, -1000: 5}, inplace=True)
# Display the DataFrame after replacement
data2

0    1.0
1    NaN
2    2.0
3    NaN
4    5.0
5    3.0
dtype: float64

In [16]:
# Renaming Axis Index Labels

data = pd.DataFrame(np.arange(15).reshape((3, 5)),
                     index=["Viridescent Venereer", "Emblem of Severe Fate", "Thundering Fury"],
                     columns=["Flower", "Feather", "Watch", "Goblet", "Circlet"])
# Mapping axis labels to new values
def map_labels(label):
    return label[:4].upper()  
# Example mapping function to uppercase first 4 characters

data.index = data.index.map(map_labels)  # Apply mapping to index

In [17]:
display(data)  # Display the DataFrame with updated index

Unnamed: 0,Flower,Feather,Watch,Goblet,Circlet
VIRI,0,1,2,3,4
EMBL,5,6,7,8,9
THUN,10,11,12,13,14


In [18]:
data.rename(index=str.title, columns=str.upper,inplace=True)  
# Rename index and columns to title case and uppercase respectively
# Display the DataFrame after renaming
display(data)  # Display the DataFrame after renaming

Unnamed: 0,FLOWER,FEATHER,WATCH,GOBLET,CIRCLET
Viri,0,1,2,3,4
Embl,5,6,7,8,9
Thun,10,11,12,13,14


In [19]:
data.rename(index=str.lower, columns=str.title,inplace=True) 
# Rename index to lowercase and columns to title case
# Display the DataFrame after renaming
display(data)   

Unnamed: 0,Flower,Feather,Watch,Goblet,Circlet
viri,0,1,2,3,4
embl,5,6,7,8,9
thun,10,11,12,13,14


In [20]:
data.rename(index={"Viridescent Venereer": "VV",
                    "Emblem of Severe Fate": "ESF",
                    "Thundering Fury": "TF"},
             columns={"Flower": "F", "Feather": "Fe", 
                        "Watch": "W", "Goblet": "G", "Circlet": "C"},
             inplace=True)  # Rename specific index and columns
# Display the DataFrame after renaming
display(data)  # Display the DataFrame after renaming

Unnamed: 0,F,Fe,W,G,C
viri,0,1,2,3,4
embl,5,6,7,8,9
thun,10,11,12,13,14


In [21]:
# Discrete Binning
data3 = pd.DataFrame({"Artifact": ["flower","feather"]* 3 + ["feather"],
                     "Emblem of Severe Fate": [1, 1,2,3,3,4,5]})
# Create bins for the 'Emblem of Severe Fate' column
bins = [0, 1, 2, 3, 4, 5, 6] # Define the bin edges
# Create labels for the bins
labels = ["Very Low", "Low", "Medium", "High", "Very High", "Gold"]
# Use pd.cut to bin the data
data3['Binned Emblem'] = pd.cut(data3['Emblem of Severe Fate'], 
                                 bins=bins, labels=labels, right=False)
# Display the DataFrame with the binned column
data3

Unnamed: 0,Artifact,Emblem of Severe Fate,Binned Emblem
0,flower,1,Low
1,feather,1,Low
2,flower,2,Medium
3,feather,3,High
4,flower,3,High
5,feather,4,Very High
6,feather,5,Gold


In [22]:
ages = [ 22, 25, 27, 30, 32, 35, 40, 45, 50, 55, 60, 65, 70, 75, 80]
# Create bins for the ages
bins_himpunan = [18, 25, 35, 60, 100]
# Create labels for the bins
Label_indikator = ["Young Adult", "Adult", "Middle Aged", "Senior"]
# Use pd.cut to bin the ages
age_groups = pd.cut(ages, bins=bins_himpunan, labels=Label_indikator, right=False)
# Display the age groups
display(age_groups) 

['Young Adult', 'Adult', 'Adult', 'Adult', 'Adult', ..., 'Senior', 'Senior', 'Senior', 'Senior', 'Senior']
Length: 15
Categories (4, object): ['Young Adult' < 'Adult' < 'Middle Aged' < 'Senior']

In [23]:
age_categories = pd.cut(ages, bins=bins_himpunan)

# Display the age categories
display(age_categories)

[(18, 25], (18, 25], (25, 35], (25, 35], (25, 35], ..., (35, 60], (60, 100], (60, 100], (60, 100], (60, 100]]
Length: 15
Categories (4, interval[int64, right]): [(18, 25] < (25, 35] < (35, 60] < (60, 100]]

In [24]:
age_groups.value_counts()  # Count the occurrences in each age group
# Display the counts of each age group


Young Adult    1
Adult          4
Middle Aged    5
Senior         5
Name: count, dtype: int64

In [25]:
age_categories.value_counts()  # Count the occurrences in each age category
# Display the counts of each age category

(18, 25]     2
(25, 35]     4
(35, 60]     5
(60, 100]    4
Name: count, dtype: int64

In [26]:
age_groups.codes  # Get the codes for the age groups
# Display the codes for the age groups

array([0, 1, 1, 1, 1, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3], dtype=int8)

In [27]:
age_categories.codes

array([0, 0, 1, 1, 1, 1, 2, 2, 2, 2, 2, 3, 3, 3, 3], dtype=int8)

In [28]:
age_groups.categories  # Get the categories of the age groups
# Display the categories of the age groups

Index(['Young Adult', 'Adult', 'Middle Aged', 'Senior'], dtype='object')

In [29]:
age_categories.categories  # Get the categories of the age categories
# Display the categories of the age categories

IntervalIndex([(18, 25], (25, 35], (35, 60], (60, 100]], dtype='interval[int64, right]')

In [30]:
data4 = np.random.uniform(0, 100, size=(10, 3))
# Create a DataFrame with the random data
df = pd.DataFrame(data4, columns=["A", "B", "C"])
# Display the DataFrame
display(df)

Unnamed: 0,A,B,C
0,28.794563,95.283977,25.524118
1,43.180222,73.659529,81.510719
2,32.989882,82.164253,11.996196
3,87.911817,25.676683,3.993801
4,80.743427,45.721881,43.961226
5,31.427481,42.781779,85.044427
6,73.096804,82.599122,78.883506
7,81.367561,77.660641,99.24723
8,12.500837,97.411251,73.094514
9,60.839252,15.217706,17.837437


In [31]:
# Bin each column of data4 separately since pd.cut requires 1D input
binned_A = pd.cut(data4[:, 0], 4, precision=2)
binned_B = pd.cut(data4[:, 1], 4, precision=2)
binned_C = pd.cut(data4[:, 2], 4, precision=2)

# Display the binned data for each column
print("Binned A:", binned_A)
print("Binned B:", binned_B)
print("Binned C:", binned_C)


Binned A: [(12.43, 31.35], (31.35, 50.21], (31.35, 50.21], (69.06, 87.91], (69.06, 87.91], (31.35, 50.21], (69.06, 87.91], (69.06, 87.91], (12.43, 31.35], (50.21, 69.06]]
Categories (4, interval[float64, right]): [(12.43, 31.35] < (31.35, 50.21] < (50.21, 69.06] < (69.06, 87.91]]
Binned B: [(76.86, 97.41], (56.31, 76.86], (76.86, 97.41], (15.14, 35.77], (35.77, 56.31], (35.77, 56.31], (76.86, 97.41], (76.86, 97.41], (76.86, 97.41], (15.14, 35.77]]
Categories (4, interval[float64, right]): [(15.14, 35.77] < (35.77, 56.31] < (56.31, 76.86] < (76.86, 97.41]]
Binned C: [(3.9, 27.81], (75.43, 99.25], (3.9, 27.81], (3.9, 27.81], (27.81, 51.62], (75.43, 99.25], (75.43, 99.25], (75.43, 99.25], (51.62, 75.43], (3.9, 27.81]]
Categories (4, interval[float64, right]): [(3.9, 27.81] < (27.81, 51.62] < (51.62, 75.43] < (75.43, 99.25]]


In [32]:
data5 = np.random.uniform(size=20)
# Create a DataFrame with the random data
data5  

array([0.53620359, 0.7612815 , 0.36553264, 0.1184362 , 0.93568409,
       0.33386065, 0.56790891, 0.2469016 , 0.29310031, 0.34232427,
       0.86466781, 0.78722346, 0.9973067 , 0.43033999, 0.27442375,
       0.54452803, 0.34622357, 0.03896588, 0.65230988, 0.24804604])

In [33]:
pd.cut(data5,4, precision=2)  # Bin the data into 4 equal-width bins with precision of 2 decimal places
# Display the binned data


[(0.52, 0.76], (0.76, 1.0], (0.28, 0.52], (0.038, 0.28], (0.76, 1.0], ..., (0.52, 0.76], (0.28, 0.52], (0.038, 0.28], (0.52, 0.76], (0.038, 0.28]]
Length: 20
Categories (4, interval[float64, right]): [(0.038, 0.28] < (0.28, 0.52] < (0.52, 0.76] < (0.76, 1.0]]

In [34]:
quantiles = pd.qcut(data5, 4, precision=2)  
# Bin the data into 4 quantiles with precision of 2 decimal places
# Display the quantiles
quantiles.value_counts()  # Count the occurrences in each quantile
# Display the counts of each quantile

(0.028999999999999998, 0.29]    5
(0.29, 0.4]                     5
(0.4, 0.68]                     5
(0.68, 1.0]                     5
Name: count, dtype: int64

In [35]:
display(quantiles)  # Display the quantiles

[(0.4, 0.68], (0.68, 1.0], (0.29, 0.4], (0.028999999999999998, 0.29], (0.68, 1.0], ..., (0.4, 0.68], (0.29, 0.4], (0.028999999999999998, 0.29], (0.4, 0.68], (0.028999999999999998, 0.29]]
Length: 20
Categories (4, interval[float64, right]): [(0.028999999999999998, 0.29] < (0.29, 0.4] < (0.4, 0.68] < (0.68, 1.0]]

In [36]:
pd.qcut(data5, [0, 0.1, 0.5, 0.9, 1.]).value_counts()
# Bin the data into specified quantiles and count occurrences
# Display the counts of each specified quantile

(0.038, 0.234]    2
(0.234, 0.398]    8
(0.398, 0.872]    8
(0.872, 0.997]    2
Name: count, dtype: int64

In [37]:
# Detecting Outliers and Filtering Data

data6 = pd.DataFrame(np.random.standard_normal((1000,4)))
# Create a DataFrame with random data
data6.describe()  # Display the descriptive statistics of the DataFrame

Unnamed: 0,0,1,2,3
count,1000.0,1000.0,1000.0,1000.0
mean,-0.0097,-0.010433,0.025964,0.011194
std,0.973137,1.001705,1.034998,0.982915
min,-3.88794,-2.665261,-3.375942,-3.159702
25%,-0.684202,-0.701802,-0.697562,-0.6177
50%,-0.044758,-0.010483,0.045063,0.018792
75%,0.619644,0.646493,0.684903,0.640351
max,2.983733,3.192669,2.914963,3.518045


In [38]:
col = data6[2] # Select the third column of the DataFrame
col.describe()  # Display the descriptive statistics of the selected column

count    1000.000000
mean        0.025964
std         1.034998
min        -3.375942
25%        -0.697562
50%         0.045063
75%         0.684903
max         2.914963
Name: 2, dtype: float64

In [39]:
col[col.abs() > 3]  # Filter the column for values greater than 3 or less than -3
# Display the filtered values

514   -3.375942
Name: 2, dtype: float64

In [40]:
display(data6)

Unnamed: 0,0,1,2,3
0,-0.624098,0.969847,-0.297652,-0.991079
1,-0.114278,1.942817,-0.901983,-1.691155
2,-1.132279,0.947824,-0.789925,0.417358
3,-0.451579,0.184587,-0.494668,0.480168
4,0.137718,2.539802,-0.011821,0.437961
...,...,...,...,...
995,0.189485,1.203030,-0.647424,2.104209
996,0.037015,0.356214,1.209348,-0.593817
997,-0.522243,0.550806,-0.760486,-0.174644
998,-0.787242,0.784016,1.277468,1.313214


In [41]:
data6[(data6.abs() > 3).any(axis="columns")]
# Filter the DataFrame for rows with any value greater than 3 or less than -3

Unnamed: 0,0,1,2,3
137,-0.048671,3.192669,1.047084,0.229457
211,-3.88794,1.578901,-1.065916,-0.068451
336,-3.454854,-0.709079,0.051562,-1.495758
371,-0.229591,1.418579,-2.705674,3.518045
514,-0.046952,1.178558,-3.375942,-0.914484
926,-0.344907,1.254014,1.57566,-3.159702
927,-3.219189,-1.508187,-0.717826,-0.829994


In [42]:
data6[data6.abs() > 3] = np.sign(data6) * 3
# Cap the values in the DataFrame to a maximum of 3 or minimum of -3
data6.describe()  # Display the descriptive statistics of the DataFrame after capping

Unnamed: 0,0,1,2,3
count,1000.0,1000.0,1000.0,1000.0
mean,-0.008138,-0.010626,0.02634,0.010836
std,0.96778,1.001107,1.033829,0.980696
min,-3.0,-2.665261,-3.0,-3.0
25%,-0.684202,-0.701802,-0.697562,-0.6177
50%,-0.044758,-0.010483,0.045063,0.018792
75%,0.619644,0.646493,0.684903,0.640351
max,2.983733,3.0,2.914963,3.0


In [43]:
np.sign(data6).head()
# Display the sign of the values in the DataFrame

Unnamed: 0,0,1,2,3
0,-1.0,1.0,-1.0,-1.0
1,-1.0,1.0,-1.0,-1.0
2,-1.0,1.0,-1.0,1.0
3,-1.0,1.0,-1.0,1.0
4,1.0,1.0,-1.0,1.0


In [44]:
# Permutation and Random Sampling
data7 = pd.DataFrame(np.random.standard_normal((10, 4)),
                     columns=["A", "B", "C", "D"])
# Create a DataFrame with random data
data7.sample(3)  # Randomly sample 3 rows from the DataFrame


Unnamed: 0,A,B,C,D
5,2.3717,-0.993378,-1.674815,1.697113
1,-0.269199,-0.230553,0.835442,-1.188545
2,0.552116,-0.641126,-0.819473,0.226482


In [45]:
# Randomly sample 3 rows from the DataFrame without replacement
data7.sample(3, replace=True)  # Randomly sample 3 rows with replacement

Unnamed: 0,A,B,C,D
1,-0.269199,-0.230553,0.835442,-1.188545
3,0.454439,1.007172,-0.279722,0.504423
3,0.454439,1.007172,-0.279722,0.504423


In [46]:
df = data7.sample(frac=0.5)  # Randomly sample 50% of the DataFrame
# Display the sampled DataFrame
df  

Unnamed: 0,A,B,C,D
4,-0.759838,0.883398,1.103989,0.791358
0,1.548481,-1.86254,2.393509,-0.318013
6,0.586687,0.146661,1.612098,0.70834
9,-0.470299,-0.966295,1.50226,-0.193562
5,2.3717,-0.993378,-1.674815,1.697113


In [47]:
display(data7)

Unnamed: 0,A,B,C,D
0,1.548481,-1.86254,2.393509,-0.318013
1,-0.269199,-0.230553,0.835442,-1.188545
2,0.552116,-0.641126,-0.819473,0.226482
3,0.454439,1.007172,-0.279722,0.504423
4,-0.759838,0.883398,1.103989,0.791358
5,2.3717,-0.993378,-1.674815,1.697113
6,0.586687,0.146661,1.612098,0.70834
7,-1.678689,0.60807,2.026437,-0.606702
8,-0.982142,1.466337,0.455335,-0.080752
9,-0.470299,-0.966295,1.50226,-0.193562


In [48]:
df = pd.DataFrame(np.arange(5 * 7).reshape((5, 7)),
                   columns=["A", "B", "C", "D", "E", "F", "G"])

df 

Unnamed: 0,A,B,C,D,E,F,G
0,0,1,2,3,4,5,6
1,7,8,9,10,11,12,13
2,14,15,16,17,18,19,20
3,21,22,23,24,25,26,27
4,28,29,30,31,32,33,34


In [49]:
# Create a random permutation of the indices
sampler = np.random.permutation(len(df))  
sampler

array([1, 2, 4, 0, 3], dtype=int32)

In [50]:
sampler1 = np.random.permutation(df.index)
# Create a random permutation of the DataFrame's index
sampler1

array([0, 3, 1, 2, 4])

In [51]:
sampler2 = np.random.permutation(df.columns)
# Create a random permutation of the DataFrame's columns
sampler2
# Use the random permutation to reorder the DataFrame

array(['E', 'D', 'G', 'A', 'F', 'C', 'B'], dtype=object)

In [52]:
sampler3 = np.random.permutation(df.values)
# Create a random permutation of the DataFrame's values
sampler3
# Use the random permutation to reorder the DataFrame's values

array([[21, 22, 23, 24, 25, 26, 27],
       [ 0,  1,  2,  3,  4,  5,  6],
       [28, 29, 30, 31, 32, 33, 34],
       [ 7,  8,  9, 10, 11, 12, 13],
       [14, 15, 16, 17, 18, 19, 20]])

In [53]:
sampler4 = np.random.permutation(df.values.T)
# Create a random permutation of the transposed DataFrame's values
sampler4
# Use the random permutation to reorder the transposed DataFrame's values


array([[ 6, 13, 20, 27, 34],
       [ 1,  8, 15, 22, 29],
       [ 2,  9, 16, 23, 30],
       [ 3, 10, 17, 24, 31],
       [ 0,  7, 14, 21, 28],
       [ 4, 11, 18, 25, 32],
       [ 5, 12, 19, 26, 33]])

In [54]:
sampler5 = np.random.permutation(5)
# Create a random permutation of the numbers 0 to 4
sampler5

array([1, 3, 4, 0, 2], dtype=int32)

In [55]:
df.take(sampler)  # Take the DataFrame with the permuted indices

Unnamed: 0,A,B,C,D,E,F,G
1,7,8,9,10,11,12,13
2,14,15,16,17,18,19,20
4,28,29,30,31,32,33,34
0,0,1,2,3,4,5,6
3,21,22,23,24,25,26,27


In [56]:
df.take(sampler1)

Unnamed: 0,A,B,C,D,E,F,G
0,0,1,2,3,4,5,6
3,21,22,23,24,25,26,27
1,7,8,9,10,11,12,13
2,14,15,16,17,18,19,20
4,28,29,30,31,32,33,34


In [57]:
# sampler3 is a permutation of df.values, not row indices.
# To display it as a DataFrame with the same columns:
pd.DataFrame(sampler3, columns=df.columns)

Unnamed: 0,A,B,C,D,E,F,G
0,21,22,23,24,25,26,27
1,0,1,2,3,4,5,6
2,28,29,30,31,32,33,34
3,7,8,9,10,11,12,13
4,14,15,16,17,18,19,20


In [58]:
# sampler4 is a permutation of df.values.T, not valid row indices for df.take().
# To display sampler4 as a DataFrame with the same columns as df and rows as df.columns:
pd.DataFrame(sampler4, columns=df.index, index=df.columns)
# sampler5 is a permutation of the numbers 0 to 4, not indices of df.

Unnamed: 0,0,1,2,3,4
A,6,13,20,27,34
B,1,8,15,22,29
C,2,9,16,23,30
D,3,10,17,24,31
E,0,7,14,21,28
F,4,11,18,25,32
G,5,12,19,26,33


In [59]:
df.take(sampler5)
# To display sampler5 as a DataFrame with the same columns as df:

Unnamed: 0,A,B,C,D,E,F,G
1,7,8,9,10,11,12,13
3,21,22,23,24,25,26,27
4,28,29,30,31,32,33,34
0,0,1,2,3,4,5,6
2,14,15,16,17,18,19,20


In [60]:
df.iloc[sampler] 
# Use iloc to select rows based on the permuted indices

Unnamed: 0,A,B,C,D,E,F,G
1,7,8,9,10,11,12,13
2,14,15,16,17,18,19,20
4,28,29,30,31,32,33,34
0,0,1,2,3,4,5,6
3,21,22,23,24,25,26,27


In [61]:
# sampler3 is a permutation of df.values, not row indices.
# To display it as a DataFrame with the same columns:
# df.iloc[sampler3] is not valid since sampler3 is not a valid index.
pd.DataFrame(sampler3, columns=df.columns)

Unnamed: 0,A,B,C,D,E,F,G
0,21,22,23,24,25,26,27
1,0,1,2,3,4,5,6
2,28,29,30,31,32,33,34
3,7,8,9,10,11,12,13
4,14,15,16,17,18,19,20


In [62]:
# sampler4 is a permutation of df.values.T, 
# not valid row indices for df.iloc[].

# To display sampler4 as a DataFrame with the same columns as 
# df and rows as df.columns:

# df.iloc[sampler4] is not valid since sampler4 is not a valid index.
pd.DataFrame(sampler4, columns=df.index, index=df.columns)

Unnamed: 0,0,1,2,3,4
A,6,13,20,27,34
B,1,8,15,22,29
C,2,9,16,23,30
D,3,10,17,24,31
E,0,7,14,21,28
F,4,11,18,25,32
G,5,12,19,26,33


In [63]:
df.iloc[sampler5] 
 # Use iloc to select rows based on the permuted indices

Unnamed: 0,A,B,C,D,E,F,G
1,7,8,9,10,11,12,13
3,21,22,23,24,25,26,27
4,28,29,30,31,32,33,34
0,0,1,2,3,4,5,6
2,14,15,16,17,18,19,20


In [64]:
column_sampler = np.random.permutation(7)

# Create a random permutation of the numbers 0 to 6
column_sampler 

array([4, 3, 6, 2, 5, 1, 0], dtype=int32)

In [65]:
df.sample(frac=1, axis=1)  
# Randomly sample all columns in the DataFrame


Unnamed: 0,A,C,F,B,G,E,D
0,0,2,5,1,6,4,3
1,7,9,12,8,13,11,10
2,14,16,19,15,20,18,17
3,21,23,26,22,27,25,24
4,28,30,33,29,34,32,31


In [66]:
df.sample(n=3)
# Randomly sample 3 columns from the DataFrame

Unnamed: 0,A,B,C,D,E,F,G
0,0,1,2,3,4,5,6
4,28,29,30,31,32,33,34
2,14,15,16,17,18,19,20


In [67]:
df.sample(n=3, axis=1)
# Randomly sample 3 columns from the DataFrame

Unnamed: 0,C,A,E
0,2,0,4
1,9,7,11
2,16,14,18
3,23,21,25
4,30,28,32


In [68]:
choices = pd.Series([5, 7, -1, 6, 4])
# Randomly sample 3 values from the Series with replacement
choices.sample(n=3, replace=True)
# Randomly sample 3 values from the Series without replacement

1    7
4    4
2   -1
dtype: int64

In [69]:
choices.sample(n=3, replace=False)
# Randomly sample 3 values from the Series without replacement

2   -1
4    4
3    6
dtype: int64

In [70]:
display(choices) 

0    5
1    7
2   -1
3    6
4    4
dtype: int64

In [71]:
choices.sample(n=10, replace=True)
# Randomly sample 10 values from the Series with replacement

2   -1
0    5
3    6
3    6
3    6
0    5
2   -1
2   -1
2   -1
1    7
dtype: int64

In [72]:
choices.sample(n=10, replace=True, random_state=42)
# Randomly sample 10 values from the Series with replacement
# replace=True allows sampling with replacement
# replace=False would raise an error if n > len(choices)
# Randomly sample 10 values from the Series with replacement


3    6
4    4
2   -1
4    4
4    4
1    7
2   -1
2   -1
2   -1
4    4
dtype: int64

In [73]:
# Computing Indicators and Dummy Variables
data8 = pd.DataFrame({"A": [1, 2, 3, 4, 5],
                      "B": [5, 6, 7, 8, 9],
                      "C": [10, 11, 12, 13, 14]})
# Create a DataFrame with random data
data8

Unnamed: 0,A,B,C
0,1,5,10
1,2,6,11
2,3,7,12
3,4,8,13
4,5,9,14


In [74]:
pd.get_dummies(data8, columns=["A", "B"], prefix=["A", "B"])
# Create dummy variables for columns A and B with specified prefixes

Unnamed: 0,C,A_1,A_2,A_3,A_4,A_5,B_5,B_6,B_7,B_8,B_9
0,10,True,False,False,False,False,True,False,False,False,False
1,11,False,True,False,False,False,False,True,False,False,False
2,12,False,False,True,False,False,False,False,True,False,False
3,13,False,False,False,True,False,False,False,False,True,False
4,14,False,False,False,False,True,False,False,False,False,True


In [75]:
pd.get_dummies(data8["A"])
# Create dummy variables for column A

Unnamed: 0,1,2,3,4,5
0,True,False,False,False,False
1,False,True,False,False,False
2,False,False,True,False,False
3,False,False,False,True,False
4,False,False,False,False,True


In [76]:
pd.get_dummies(data8["A"], prefix="A")
# Create dummy variables for column A with specified prefix

Unnamed: 0,A_1,A_2,A_3,A_4,A_5
0,True,False,False,False,False
1,False,True,False,False,False
2,False,False,True,False,False
3,False,False,False,True,False
4,False,False,False,False,True


In [77]:
dummies = pd.get_dummies(data8["A"], prefix="A")
# Create dummy variables for column A with specified prefix
data8_dummies = data8[["A", "B"]].join(dummies)
# Join the dummy variables with the original DataFrame
data8_dummies

Unnamed: 0,A,B,A_1,A_2,A_3,A_4,A_5
0,1,5,True,False,False,False,False
1,2,6,False,True,False,False,False
2,3,7,False,False,True,False,False
3,4,8,False,False,False,True,False
4,5,9,False,False,False,False,True


In [78]:
deaf_tone_bard = pd.DataFrame({"kunci nada": ["b", "b", "a", "c", "a", "b"],
                              "nilai": range(6)})
# Create a DataFrame with random data
deaf_tone_bard 

Unnamed: 0,kunci nada,nilai
0,b,0
1,b,1
2,a,2
3,c,3
4,a,4
5,b,5


In [79]:
dull = pd.get_dummies(deaf_tone_bard["kunci nada"])
# Create dummy variables for the 'kunci nada' column
display(dull)  # Display the dummy variables DataFrame

Unnamed: 0,a,b,c
0,False,True,False
1,False,True,False
2,True,False,False
3,False,False,True
4,True,False,False
5,False,True,False


In [80]:
pd.get_dummies(deaf_tone_bard["kunci nada"], 
               prefix="partitur")

Unnamed: 0,partitur_a,partitur_b,partitur_c
0,False,True,False
1,False,True,False
2,True,False,False
3,False,False,True
4,True,False,False
5,False,True,False


In [81]:
pd.get_dummies(deaf_tone_bard["nilai"])

Unnamed: 0,0,1,2,3,4,5
0,True,False,False,False,False,False
1,False,True,False,False,False,False
2,False,False,True,False,False,False
3,False,False,False,True,False,False
4,False,False,False,False,True,False
5,False,False,False,False,False,True


In [82]:
pd.get_dummies(deaf_tone_bard["nilai"], prefix="nilai")
# Create dummy variables for the 'nilai' column with specified prefix

Unnamed: 0,nilai_0,nilai_1,nilai_2,nilai_3,nilai_4,nilai_5
0,True,False,False,False,False,False
1,False,True,False,False,False,False
2,False,False,True,False,False,False
3,False,False,False,True,False,False
4,False,False,False,False,True,False
5,False,False,False,False,False,True


In [83]:
with open(r"F:\terrabox download\tmdb_movie_dataset.dat", "r", encoding="utf-8") as dat_file:
    for i, line in enumerate(dat_file):
        if i < 10:
            print(line.strip())  
            # Display the first 10 lines of the file
        else:
            print("the contents file is too long to display")
            break


id | title | vote_average | vote_count | status | release_date | revenue | runtime | adult | backdrop_path | budget | homepage | imdb_id | original_language | original_title | overview | popularity | poster_path | tagline | genres | production_companies | production_countries | spoken_languages | keywords
27205 | Inception | 8.364 | 34495 | Released | 2010-07-15 | 825532764 | 148 | False | /8ZTVqvKDQ8emSGUEMjsS4yHAwrp.jpg | 160000000 | https://www.warnerbros.com/movies/inception | tt1375666 | en | Inception | Cobb, a skilled thief who commits corporate espionage by infiltrating the subconscious of his targets is offered a chance to regain his old life as payment for a task considered to be impossible: "inception", the implantation of another person's idea into a target's subconscious. | 83.952 | /oYuLEt3zVCKq57qu2F8dT7NIa6f.jpg | Your mind is the scene of the crime. | Action, Science Fiction, Adventure | Legendary Pictures, Syncopy, Warner Bros. Pictures | United Kingdom, United States

In [95]:
with open(r"F:\terrabox download\tmdb_movie_dataset.dat", "r", encoding="utf-8") as dat_file:
    for _ in range(10):
        print(dat_file.readline().strip())

id | title | vote_average | vote_count | status | release_date | revenue | runtime | adult | backdrop_path | budget | homepage | imdb_id | original_language | original_title | overview | popularity | poster_path | tagline | genres | production_companies | production_countries | spoken_languages | keywords
27205 | Inception | 8.364 | 34495 | Released | 2010-07-15 | 825532764 | 148 | False | /8ZTVqvKDQ8emSGUEMjsS4yHAwrp.jpg | 160000000 | https://www.warnerbros.com/movies/inception | tt1375666 | en | Inception | Cobb, a skilled thief who commits corporate espionage by infiltrating the subconscious of his targets is offered a chance to regain his old life as payment for a task considered to be impossible: "inception", the implantation of another person's idea into a target's subconscious. | 83.952 | /oYuLEt3zVCKq57qu2F8dT7NIa6f.jpg | Your mind is the scene of the crime. | Action, Science Fiction, Adventure | Legendary Pictures, Syncopy, Warner Bros. Pictures | United Kingdom, United States

In [85]:
import pandas as pd

# Define columns you want to keep
selected_columns = ["id", "title", "genres"]

# Load the dataset and filter columns
movies = pd.read_csv(r"F:\terrabox download\tmdb_movie_dataset.dat", sep="|", 
                     header=None, 
                     names=selected_columns, 
                     engine="python",
                     on_bad_lines='skip')

# Display the first few rows
movies.head(2)  # Display the first two rows of the DataFrame

Unnamed: 0,Unnamed: 1,Unnamed: 2,Unnamed: 3,Unnamed: 4,Unnamed: 5,Unnamed: 6,Unnamed: 7,Unnamed: 8,Unnamed: 9,Unnamed: 10,Unnamed: 11,Unnamed: 12,Unnamed: 13,Unnamed: 14,Unnamed: 15,Unnamed: 16,Unnamed: 17,Unnamed: 18,Unnamed: 19,Unnamed: 20,id,title,genres
id,title,vote_average,vote_count,status,release_date,revenue,runtime,adult,backdrop_path,budget,homepage,imdb_id,original_language,original_title,overview,popularity,poster_path,tagline,genres,production_companies,production_countries,spoken_languages,keywords
27205,Inception,8.364,34495,Released,2010-07-15,825532764,148,False,/8ZTVqvKDQ8emSGUEMjsS4yHAwrp.jpg,160000000,https://www.warnerbros.com/movies/inception,tt1375666,en,Inception,"Cobb, a skilled thief who commits corporate espionage by infiltrating the subconscious of his targets is offered a chance to regain his old life as payment for a task considered to be impossible: ""inception"", the implantation of another person's idea into a target's subconscious.",83.952,/oYuLEt3zVCKq57qu2F8dT7NIa6f.jpg,Your mind is the scene of the crime.,"Action, Science Fiction, Adventure","Legendary Pictures, Syncopy, Warner Bros. Pictures","United Kingdom, United States of America","English, French, Japanese, Swahili","rescue, mission, dream, airplane, paris, fran..."


In [96]:
# First, drop duplicates to reduce memory usage
genres_clean = movies["genres"].drop_duplicates()

# Split the genres and get unique genre names
unique_genres = set()
for genre_list in genres_clean:
	for genre in genre_list.split(","):
		genre = genre.strip()
		if genre:
			unique_genres.add(genre)

# Now, create dummies only for these unique genres
def genre_dummies(genre_str):
	genres = [g.strip() for g in genre_str.split(",")]
	return pd.Series({genre: int(genre in genres) for genre in unique_genres})

# Apply to a small sample to avoid memory issues
dummies = movies["genres"].head(1000).apply(genre_dummies)
dummies.head(2)

Unnamed: 0,Unnamed: 1,Unnamed: 2,Unnamed: 3,Unnamed: 4,Unnamed: 5,Unnamed: 6,Unnamed: 7,Unnamed: 8,Unnamed: 9,Unnamed: 10,Unnamed: 11,Unnamed: 12,Unnamed: 13,Unnamed: 14,Unnamed: 15,Unnamed: 16,Unnamed: 17,Unnamed: 18,Unnamed: 19,Unnamed: 20,biblioteca,on fandor,rush,hardwell,in love with enemy,inquinamento mare,дилдо,production,arabesque,tan zuoren,...,exo planet,perspectiva,russian news about the future,morecambe,magical thinking,paddling,political corruption,outtakes,sbt,talking statue
id,title,vote_average,vote_count,status,release_date,revenue,runtime,adult,backdrop_path,budget,homepage,imdb_id,original_language,original_title,overview,popularity,poster_path,tagline,genres,production_companies,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
27205,Inception,8.364,34495,Released,2010-07-15,825532764,148,False,/8ZTVqvKDQ8emSGUEMjsS4yHAwrp.jpg,160000000,https://www.warnerbros.com/movies/inception,tt1375666,en,Inception,"Cobb, a skilled thief who commits corporate espionage by infiltrating the subconscious of his targets is offered a chance to regain his old life as payment for a task considered to be impossible: ""inception"", the implantation of another person's idea into a target's subconscious.",83.952,/oYuLEt3zVCKq57qu2F8dT7NIa6f.jpg,Your mind is the scene of the crime.,"Action, Science Fiction, Adventure","Legendary Pictures, Syncopy, Warner Bros. Pictures",0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [87]:
import csv

# Use the mnames list already defined in your notebook
# Read the DAT file into a DataFrame


cinemas = pd.read_table(
    r"F:\terrabox download\tmdb_movie_dataset.dat",
    sep="|",
    header=None,
    #names=mnames,
    engine="python",
    #quotechar='"',
    #quoting=csv.QUOTE_MINIMAL,
    on_bad_lines='skip'
)
# Display the first few rows of the DataFrame
cinemas.head(2)  # Display the first 2 rows of the DataFrame

# jika file terlalu besar, gunakan head() untuk melihat beberapa baris pertama
# atau gunakan sample() untuk mengambil sampel acak dari DataFrame
# juga on_bad_lines='skip' untuk menghindari error pada baris yang tidak sesuai format

# jika hasil index kolom tidak sesuai, gunakan reset_index(drop=True) untuk mengatur ulang index
# cinemas.reset_index(drop=True, inplace=True)  # Reset index and drop the old index
# atau hasil csv tidak sesuai, gunakan read_csv() dengan parameter yang sesuai  
# file csv masih merupakan data mentah yang perlu dibersihkan
# misalnya, jika ada kolom yang tidak sesuai, gunakan usecols untuk memilih kolom
# atau gunakan skipinitialspace=True untuk mengabaikan spasi awal pada kolom
# atau gunakan dtype untuk menentukan tipe data pada kolom tertentu

# atau gunakan converters untuk mengonversi tipe data pada kolom tertentu
# atau gunakan na_values untuk menentukan nilai yang dianggap NaN
# atau gunakan keep_default_na=False untuk menghindari nilai NaN default

# atau gunakan low_memory=False untuk menghindari pembacaan data dalam potongan kecil
# proses data mentah menggunakan excel atau csv reader dan lakukan pembersihan data



Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,14,15,16,17,18,19,20,21,22,23
0,id,title,vote_average,vote_count,status,release_date,revenue,runtime,adult,backdrop_path,...,original_title,overview,popularity,poster_path,tagline,genres,production_companies,production_countries,spoken_languages,keywords
1,27205,Inception,8.364,34495,Released,2010-07-15,825532764,148,False,/8ZTVqvKDQ8emSGUEMjsS4yHAwrp.jpg,...,Inception,"Cobb, a skilled thief who commits corporate e...",83.952,/oYuLEt3zVCKq57qu2F8dT7NIa6f.jpg,Your mind is the scene of the crime.,"Action, Science Fiction, Adventure","Legendary Pictures, Syncopy, Warner Bros. Pic...","United Kingdom, United States of America","English, French, Japanese, Swahili","rescue, mission, dream, airplane, paris, fran..."


In [88]:
cinemas.drop(cinemas.columns[[2,3,4,5,7,8,9,11,12,13,14,15,16,17,18,20]], 
             axis="columns")

Unnamed: 0,0,1,6,10,19,21,22,23
0,id,title,revenue,budget,genres,production_countries,spoken_languages,keywords
1,27205,Inception,825532764,160000000,"Action, Science Fiction, Adventure","United Kingdom, United States of America","English, French, Japanese, Swahili","rescue, mission, dream, airplane, paris, fran..."
2,157336,Interstellar,701729206,165000000,"Adventure, Drama, Science Fiction","United Kingdom, United States of America",English,"rescue, future, spacecraft, race against time..."
3,155,The Dark Knight,1004558444,185000000,"Drama, Action, Crime, Thriller","United Kingdom, United States of America","English, Mandarin","joker, sadism, chaos, secret identity, crime ..."
4,19995,Avatar,2923706026,237000000,"Action, Adventure, Fantasy, Science Fiction","United States of America, United Kingdom","English, Spanish","future, society, culture clash, space travel,..."
...,...,...,...,...,...,...,...,...
1235035,783317,Mabel,0,0,,,,
1235036,783318,Iwo Jima: 36 Days of Hell,0,0,"Documentary, War",,,"world war ii, marine corps, iwo jima"
1235037,783319,Void Weaves,0,0,,,,
1235038,783320,Marriage: Shattered Vows,0,0,Documentary,"Canada, United States of America",English,


In [89]:
# Only drop columns that exist in the DataFrame
# For example, to drop the 'genres' column:
movies.drop(columns=['vote_average', 'vote_count', 'status', 'release_date','adult','backdrop_path',
                     'homepage', 'imdb_id', 'original_language', 'original_title', 'overview',
                     'popularity','poster_path','tagline', 'production_companies'], inplace=True, errors='ignore')
# Display the DataFrame after dropping columns
movies.head(2)  # Display the first 2 rows of the DataFrame after dropping

# somegthing is slack here
# Display the DataFrame after dropping columns
# It will be stored in tmdb_movie_dataset notebook

Unnamed: 0,Unnamed: 1,Unnamed: 2,Unnamed: 3,Unnamed: 4,Unnamed: 5,Unnamed: 6,Unnamed: 7,Unnamed: 8,Unnamed: 9,Unnamed: 10,Unnamed: 11,Unnamed: 12,Unnamed: 13,Unnamed: 14,Unnamed: 15,Unnamed: 16,Unnamed: 17,Unnamed: 18,Unnamed: 19,Unnamed: 20,id,title,genres
id,title,vote_average,vote_count,status,release_date,revenue,runtime,adult,backdrop_path,budget,homepage,imdb_id,original_language,original_title,overview,popularity,poster_path,tagline,genres,production_companies,production_countries,spoken_languages,keywords
27205,Inception,8.364,34495,Released,2010-07-15,825532764,148,False,/8ZTVqvKDQ8emSGUEMjsS4yHAwrp.jpg,160000000,https://www.warnerbros.com/movies/inception,tt1375666,en,Inception,"Cobb, a skilled thief who commits corporate espionage by infiltrating the subconscious of his targets is offered a chance to regain his old life as payment for a task considered to be impossible: ""inception"", the implantation of another person's idea into a target's subconscious.",83.952,/oYuLEt3zVCKq57qu2F8dT7NIa6f.jpg,Your mind is the scene of the crime.,"Action, Science Fiction, Adventure","Legendary Pictures, Syncopy, Warner Bros. Pictures","United Kingdom, United States of America","English, French, Japanese, Swahili","rescue, mission, dream, airplane, paris, fran..."


In [90]:
# Reading an Excel file and writing it to a .dat file 
import pandas as pd

# Read the Excel file into a DataFrame
df = pd.read_excel(
    r"G:/Engineering & Design Archives/Computing Engineering/Data Science & Engineering Work/Python Grimoire at work/Data Analyst Concepts/Wes McKinney-Python for Data Analyst/Chapter 7 Data Cleaning & preparations/data_set_merge_inquiries.xlsx"
)

# Write the DataFrame to a .dat file with '|' separator
df.to_csv("data_set_merge_inquiries.dat", sep="|", index=False, encoding="utf-8")

# Read the DAT file and display the first few lines
with open("data_set_merge_inquiries.dat", "r", encoding="utf-8") as dat_file:
    for i, line in enumerate(dat_file):
        if i < 10:
            print(line.strip())
        else:
            print("the contents file is too long to display")
            break

CUSTOMER NAME|SALES|ORDER LINE NUMBER|PRICE EACH|QUANTITY ORDER
Amica Models &|8014.82|2|100.0|34
Anna's Decorati|2416.56|4|100.0|24
Atelier graphiqu|3757.26|1|96.34|39
Australian Colle|16126.61|9|396.34000000000003|143
Australian Gift N|13302.83|9|200.0|73
Auto Assoc. & C|10172.7|6|100.0|47
Auto Canal Petit|4708.44|14|100.0|41
Baane Mini Imp|9799.0|16|200.0|67
Cambridge Coll|6463.23|2|100.0|29
the contents file is too long to display


In [106]:
bnames = ["CUSTOMER NAME", "SALES", "ORDER LINE NUMBER","PRICE EACH", "QUANTITY ORDER"]
# Read the DAT file into a DataFrame
business = pd.read_table("data_set_merge_inquiries.dat",
                         sep="|", header=None, names=bnames, engine="python")
# Display the first few rows of the DataFrame
business.head(10)  # Display the first 10 rows of the DataFrame

Unnamed: 0,CUSTOMER NAME,SALES,ORDER LINE NUMBER,PRICE EACH,QUANTITY ORDER
0,CUSTOMER NAME,SALES,ORDER LINE NUMBER,PRICE EACH,QUANTITY ORDER
1,Amica Models &,8014.82,2,100.0,34
2,Anna's Decorati,2416.56,4,100.0,24
3,Atelier graphiqu,3757.26,1,96.34,39
4,Australian Colle,16126.61,9,396.34000000000003,143
5,Australian Gift N,13302.83,9,200.0,73
6,Auto Assoc. & C,10172.7,6,100.0,47
7,Auto Canal Petit,4708.44,14,100.0,41
8,Baane Mini Imp,9799.0,16,200.0,67
9,Cambridge Coll,6463.23,2,100.0,29


In [107]:
business_dummies= business["CUSTOMER NAME"].str.get_dummies(sep=",")
# Create dummy variables for the 'CUSTOMER NAME' column
business_dummies.head(10)  # Display the first 10 rows of the dummy variables DataFrame

Unnamed: 0,Amica Models &,Anna's Decorati,Atelier graphiqu,Australian Colle,Australian Gift N,Auto Assoc. & C,Auto Canal Petit,Baane Mini Imp,CUSTOMER NAME,Cambridge Coll,...,Super Scale Inc,Technics Stores,Tekni Collectabl,Tokyo Collectabl,Toys of Finland,Toys4GrownUp,UK Collectables,Vitachrome Inc.,Volvo Model Re,West Coast Coll
0,0,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
1,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
9,0,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0


In [97]:
dummies.iloc[:10, :6]  # Display the first 10 rows and first 6 columns of the dummy variables DataFrame
# Create dummy variables for the 'CUSTOMER NAME' column with a maximum of 6 columns 

Unnamed: 0,Unnamed: 1,Unnamed: 2,Unnamed: 3,Unnamed: 4,Unnamed: 5,Unnamed: 6,Unnamed: 7,Unnamed: 8,Unnamed: 9,Unnamed: 10,Unnamed: 11,Unnamed: 12,Unnamed: 13,Unnamed: 14,Unnamed: 15,Unnamed: 16,Unnamed: 17,Unnamed: 18,Unnamed: 19,Unnamed: 20,biblioteca,on fandor,rush,hardwell,in love with enemy,inquinamento mare
id,title,vote_average,vote_count,status,release_date,revenue,runtime,adult,backdrop_path,budget,homepage,imdb_id,original_language,original_title,overview,popularity,poster_path,tagline,genres,production_companies,0,0,0,0,0,0
27205,Inception,8.364,34495,Released,2010-07-15,825532764,148,False,/8ZTVqvKDQ8emSGUEMjsS4yHAwrp.jpg,160000000,https://www.warnerbros.com/movies/inception,tt1375666,en,Inception,"Cobb, a skilled thief who commits corporate espionage by infiltrating the subconscious of his targets is offered a chance to regain his old life as payment for a task considered to be impossible: ""inception"", the implantation of another person's idea into a target's subconscious.",83.952,/oYuLEt3zVCKq57qu2F8dT7NIa6f.jpg,Your mind is the scene of the crime.,"Action, Science Fiction, Adventure","Legendary Pictures, Syncopy, Warner Bros. Pictures",0,0,0,0,0,0
157336,Interstellar,8.417,32571,Released,2014-11-05,701729206,169,False,/pbrkL804c8yAv3zBZR4QPEafpAR.jpg,165000000,http://www.interstellarmovie.net/,tt0816692,en,Interstellar,The adventures of a group of explorers who make use of a newly discovered wormhole to surpass the limitations on human space travel and conquer the vast distances involved in an interstellar voyage.,140.241,/gEU2QniE6E77NI6lCU6MxlNBvIx.jpg,Mankind was born on Earth. It was never meant to die here.,"Adventure, Drama, Science Fiction","Legendary Pictures, Syncopy, Lynda Obst Productions",0,0,0,0,0,0
155,The Dark Knight,8.512,30619,Released,2008-07-16,1004558444,152,False,/nMKdUUepR0i5zn0y1T4CsSB5chy.jpg,185000000,https://www.warnerbros.com/movies/dark-knight/,tt0468569,en,The Dark Knight,"Batman raises the stakes in his war on crime. With the help of Lt. Jim Gordon and District Attorney Harvey Dent, Batman sets out to dismantle the remaining criminal organizations that plague the streets. The partnership proves to be effective, but they soon find themselves prey to a reign of chaos unleashed by a rising criminal mastermind known to the terrified citizens of Gotham as the Joker.",130.643,/qJ2tW6WMUDux911r6m7haRef0WH.jpg,Welcome to a world without rules.,"Drama, Action, Crime, Thriller","DC Comics, Legendary Pictures, Syncopy, Isobel Griffiths, Warner Bros. Pictures",0,0,0,0,0,0
19995,Avatar,7.573,29815,Released,2009-12-15,2923706026,162,False,/vL5LR6WdxWPjLPFRLe133jXWsh5.jpg,237000000,https://www.avatar.com/movies/avatar,tt0499549,en,Avatar,"In the 22nd century, a paraplegic Marine is dispatched to the moon Pandora on a unique mission, but becomes torn between following orders and protecting an alien civilization.",79.932,/kyeqWdyUXW608qlYkRqosgbbJyK.jpg,Enter the world of Pandora.,"Action, Adventure, Fantasy, Science Fiction","Dune Entertainment, Lightstorm Entertainment, 20th Century Fox, Ingenious Media",0,0,0,0,0,0
24428,The Avengers,7.71,29166,Released,2012-04-25,1518815515,143,False,/9BBTo63ANSmhC4e6r62OJFuK2GL.jpg,220000000,https://www.marvel.com/movies/the-avengers,tt0848228,en,The Avengers,"When an unexpected enemy emerges and threatens global safety and security, Nick Fury, director of the international peacekeeping agency known as S.H.I.E.L.D., finds himself in need of a team to pull the world back from the brink of disaster. Spanning the globe, a daring recruitment effort begins!",98.082,/RYMX2wcKCBAr24UyPD7xwmjaTn.jpg,Some assembly required.,"Science Fiction, Action, Adventure",Marvel Studios,0,0,0,0,0,0
293660,Deadpool,7.606,28894,Released,2016-02-09,783100000,108,False,/en971MEXui9diirXlogOrPKmsEn.jpg,58000000,https://www.20thcenturystudios.com/movies/deadpool,tt1431045,en,Deadpool,"The origin story of former Special Forces operative turned mercenary Wade Wilson, who, after being subjected to a rogue experiment that leaves him with accelerated healing powers, adopts the alter ego Deadpool. Armed with his new abilities and a dark, twisted sense of humor, Deadpool hunts down the man who nearly destroyed his life.",72.735,/zq8Cl3PNIDGU3iWNRoc5nEZ6pCe.jpg,Witness the beginning of a happy ending.,"Action, Adventure, Comedy","20th Century Fox, The Donners' Company, Genre Films",0,0,0,0,0,0
299536,Avengers: Infinity War,8.255,27713,Released,2018-04-25,2052415039,149,False,/mDfJG3LC3Dqb67AZ52x3Z0jU0uB.jpg,300000000,https://www.marvel.com/movies/avengers-infinity-war,tt4154756,en,Avengers: Infinity War,"As the Avengers and their allies have continued to protect the world from threats too large for any one hero to handle, a new danger has emerged from the cosmic shadows: Thanos. A despot of intergalactic infamy, his goal is to collect all six Infinity Stones, artifacts of unimaginable power, and use them to inflict his twisted will on all of reality. Everything the Avengers have fought for has led up to this moment - the fate of Earth and existence itself has never been more uncertain.",154.34,/7WsyChQLEftFiDOVTGkv3hFpyyt.jpg,An entire universe. Once and for all.,"Adventure, Action, Science Fiction",Marvel Studios,0,0,0,0,0,0
550,Fight Club,8.438,27238,Released,1999-10-15,100853753,139,False,/hZkgoQYus5vegHoetLkCJzb17zJ.jpg,63000000,http://www.foxmovies.com/movies/fight-club,tt0137523,en,Fight Club,"A ticking-time-bomb insomniac and a slippery soap salesman channel primal male aggression into a shocking new form of therapy. Their concept catches on, with underground ""fight clubs"" forming in every town, until an eccentric gets in the way and ignites an out-of-control spiral toward oblivion.",69.498,/pB8BM7pdSp6B6Ih7QZ4DrQ3PmJK.jpg,Mischief. Mayhem. Soap.,Drama,"Regency Enterprises, Fox 2000 Pictures, Taurus Film, Atman Entertainment, Knickerbocker Films, The Linson Company, 20th Century Fox",0,0,0,0,0,0
118340,Guardians of the Galaxy,7.906,26638,Released,2014-07-30,772776600,121,False,/uLtVbjvS1O7gXL8lUOwsFOH4man.jpg,170000000,http://marvel.com/guardians,tt2015381,en,Guardians of the Galaxy,"Light years from Earth, 26 years after being abducted, Peter Quill finds himself the prime target of a manhunt after discovering an orb wanted by Ronan the Accuser.",33.255,/r7vmZjiyZw9rpJMQJdXpjgiCOk9.jpg,All heroes start somewhere.,"Action, Science Fiction, Adventure",Marvel Studios,0,0,0,0,0,0


In [None]:
# Drop the second column (by position) from the DataFrame
business.drop(business.columns[2], axis="columns")


# Display the DataFrame after dropping the second column
business.head(10)  # Display the first 10 rows of the DataFrame after dropping

Unnamed: 0,CUSTOMER NAME,SALES,ORDER LINE NUMBER,PRICE EACH,QUANTITY ORDER
0,CUSTOMER NAME,SALES,ORDER LINE NUMBER,PRICE EACH,QUANTITY ORDER
1,Amica Models &,8014.82,2,100.0,34
2,Anna's Decorati,2416.56,4,100.0,24
3,Atelier graphiqu,3757.26,1,96.34,39
4,Australian Colle,16126.61,9,396.34000000000003,143
5,Australian Gift N,13302.83,9,200.0,73
6,Auto Assoc. & C,10172.7,6,100.0,47
7,Auto Canal Petit,4708.44,14,100.0,41
8,Baane Mini Imp,9799.0,16,200.0,67
9,Cambridge Coll,6463.23,2,100.0,29


In [101]:
business[10:20]  # Display rows from index 11 to 19 (exclusive) of the DataFrame

Unnamed: 0,CUSTOMER NAME,SALES,ORDER LINE NUMBER,PRICE EACH,QUANTITY ORDER
10,Canadian Gift E,9064.89,6,100.0,47
11,Classic Gift Idea,5372.57,8,100.0,23
12,Classic Legend,4860.24,1,100.0,21
13,Collectables For,4514.92,2,100.0,41
14,Corporate Gift I,18476.9,34,300.0,122
15,Corrida Auto Re,7329.06,11,100.0,38
16,Daedalus Desig,7193.370000000001,21,186.13,71
17,Diecast Classic,9924.54,5,194.74,91
18,Dragon Souveni,10993.5,8,100.0,45
19,Euro Shopping,19517.08,8,200.0,116


In [102]:
business[20:30]  
# Display rows from index 21 to 29 (exclusive) of the DataFrame

Unnamed: 0,CUSTOMER NAME,SALES,ORDER LINE NUMBER,PRICE EACH,QUANTITY ORDER
20,FunGiftIdeas.co,5432.62,9,120.87,92
21,Gift Depot Inc.,8148.76,7,200.0,74
22,Herkku Gifts,9173.24,5,196.34,86
23,La Rochelle Gift,7814.9,17,227.74,88
24,Land of Toys Inc,14339.63,10,395.61,131
25,Lyon Souvenier,7192.110000000001,7,194.74,68
26,Marta's Replica,10584.27,15,200.0,83
27,Mini Classics,4043.96,2,100.0,34
28,Mini Gifts Distrib,17315.54,13,200.0,80
29,Mini Wheels Co.,8197.46,10,298.57,72


In [118]:
np.random.seed(12345)  # Set a random seed for reproducibility
# Create a DataFrame with random data
values = np.random.uniform(size=10)
# Display the random values
values

array([0.92961609, 0.31637555, 0.18391881, 0.20456028, 0.56772503,
       0.5955447 , 0.96451452, 0.6531771 , 0.74890664, 0.65356987])

In [119]:
bins_himpunan2 = [0, 0.2, 0.4, 0.6, 0.8, 1.0]
# Define the bin edges
pd.get_dummies(pd.cut(values, bins=bins_himpunan2, precision=2))
# Create dummy variables for the binned values with specified precision

Unnamed: 0,"(0.0, 0.2]","(0.2, 0.4]","(0.4, 0.6]","(0.6, 0.8]","(0.8, 1.0]"
0,False,False,False,False,True
1,False,True,False,False,False
2,True,False,False,False,False
3,False,True,False,False,False
4,False,False,True,False,False
5,False,False,True,False,False
6,False,False,False,False,True
7,False,False,False,True,False
8,False,False,False,True,False
9,False,False,False,True,False
