In [1]:
import pandas as pd, numpy as np

## intersections 

In [2]:
def align_dtypes(df1, df2):
    """Cast the columns `df2` shares with `df1` to `df1`'s dtypes.

    `df2` is modified in place; `df1` is only read.

    Text spellings of booleans ("True"/"False", any case, surrounding
    whitespace ignored) are parsed before casting to a boolean dtype. A plain
    ``astype(bool)`` uses truthiness, so every non-empty string -- including
    "False" -- would become ``True``.

    Parameters
    ----------
    df1 : pandas.DataFrame
        Reference frame supplying the target dtypes.
    df2 : pandas.DataFrame
        Frame whose shared columns are overwritten with the cast values.

    Returns
    -------
    tuple
        ``(df1, df2)`` -- the same two objects that were passed in.

    Raises
    ------
    Exception
        Whatever ``Series.astype`` raises when a value cannot be converted
        to the target dtype.
    """
    bool_words = {"true": True, "false": False}

    def _parse_bool_word(value):
        # Only recognised spellings are translated; any other value is handed
        # to astype unchanged, so its result matches a plain cast.
        if isinstance(value, str):
            return bool_words.get(value.strip().lower(), value)
        return value

    for col in df1.columns.intersection(df2.columns):
        target_dtype = df1[col].dtype
        values = df2[col]
        # Numeric and boolean sources already cast to bool sensibly; only
        # non-numeric (text-like) sources need the spelling lookup.
        if (pd.api.types.is_bool_dtype(target_dtype)
                and not pd.api.types.is_numeric_dtype(values.dtype)):
            values = values.map(_parse_bool_word)
        df2[col] = values.astype(target_dtype)
    return df1, df2

In [25]:
# Same column name in both frames, but real booleans in df1 and the text
# "True" in df2.
df1 = pd.DataFrame({'x': [True] * 5})
df2 = pd.DataFrame({'x': ["True"]*5})
# Alignment call left disabled, so df2 keeps its original dtype.
# NOTE(review): this previously named `df`, which is not defined at this
# point in the notebook; `df1` is presumably what was meant.
# align_dtypes(df1, df2)

In [27]:
# Inspect df2's column dtypes; the string column reports `object`.
df2.dtypes

x    object
dtype: object

## `in` with Series 

In [9]:
# Values 10..19 on the default 0..9 index, so no value is also an index label.
s = pd.Series(range(10,20)); s

0    10
1    11
2    12
3    13
4    14
5    15
6    16
7    17
8    18
9    19
dtype: int64

In [14]:
# `in` on a Series checks the index labels, not the values: 11 is a value but
# not a label, hence False. Converting to a list checks the values instead.
11 in s, 11 in list(s)

(False, True)

## df equality 

In [15]:
import pandas as pd 
import numpy as np 

In [16]:
def make_dfs(*args):
    """Return two separate DataFrames holding identical random values.

    `args` are the dimensions forwarded to ``np.random.rand``. NumPy's global
    generator is reseeded with 0 before each draw, so both frames contain the
    same numbers while remaining distinct objects.
    """
    frames = []
    for _ in range(2):
        # Reset the global seed each time so the two draws are identical.
        np.random.seed(0)
        frames.append(pd.DataFrame(np.random.rand(*args)))
    first, second = frames
    return first, second


df1, df2 = make_dfs(10, 1)

In [17]:
# Preview the first rows of the first frame.
df1.head()

Unnamed: 0,0
0,0.548814
1,0.715189
2,0.602763
3,0.544883
4,0.423655


In [18]:
# Preview the second frame; it was drawn after reseeding with the same seed,
# so the rows should match df1's.
df2.head()

Unnamed: 0,0
0,0.548814
1,0.715189
2,0.602763
3,0.544883
4,0.423655


In [20]:
# `==` compares cell by cell and yields a boolean DataFrame, not a single bool.
df1==df2

Unnamed: 0,0
0,True
1,True
2,True
3,True
4,True
5,True
6,True
7,True
8,True
9,True


In [21]:
# First reduction: one boolean per column.
(df1 == df2).all()

0    True
dtype: bool

In [22]:
# Second reduction: collapse the per-column results into one overall bool.
(df1 == df2).all().all()

True

# identical columns 

In [7]:
# Both columns deliberately carry the same label 'yes'.
df=pd.DataFrame(np.random.rand(3,2), columns=['yes','yes']); df

Unnamed: 0,yes,yes.1
0,0.697827,0.756066
1,0.48306,0.834582
2,0.611664,0.001328


In [8]:
# With a duplicated label, selecting by name returns every matching column
# (a two-column DataFrame here) rather than a single Series.
df['yes']

Unnamed: 0,yes,yes.1
0,0.697827,0.756066
1,0.48306,0.834582
2,0.611664,0.001328


In [17]:
# A one-item label list through .loc still selects both 'yes' columns.
df.loc[:,['yes']]

Unnamed: 0,yes,yes.1
0,0.697827,0.756066
1,0.48306,0.834582
2,0.611664,0.001328


# quartile binning 

In [18]:
# 100 random floats to bin; no seed is set in this cell, so the values depend
# on the current state of NumPy's global generator.
s = pd.Series(np.random.rand(100)); s

0     0.108511
1     0.221347
2     0.239827
3     0.633540
4     0.648709
        ...   
95    0.593336
96    0.849349
97    0.407497
98    0.225728
99    0.707092
Length: 100, dtype: float64

In [22]:
# Put each value next to its quartile bin. Joining column-wise keeps the
# per-column dtypes (float values, categorical bins); building the frame from
# a list of Series and transposing would store both columns as `object`.
# Two unnamed Series are labelled 0 and 1, as before.
df = pd.concat([s, pd.qcut(s, [0, 0.25, 0.5, 0.75, 1.])], axis=1)
df

Unnamed: 0,0,1
0,0.108511,"(0.00893, 0.321]"
1,0.221347,"(0.00893, 0.321]"
2,0.239827,"(0.00893, 0.321]"
3,0.63354,"(0.535, 0.779]"
4,0.648709,"(0.535, 0.779]"
...,...,...
95,0.593336,"(0.535, 0.779]"
96,0.849349,"(0.779, 0.988]"
97,0.407497,"(0.321, 0.535]"
98,0.225728,"(0.00893, 0.321]"


In [19]:
# Quantile-based bins hold equal shares of the data: 25 of the 100 values each.
pd.qcut(s, [0,0.25,0.5,0.75,1.]).value_counts()

(0.00893, 0.321]    25
(0.321, 0.535]      25
(0.535, 0.779]      25
(0.779, 0.988]      25
Name: count, dtype: int64

## fragmentation

In [1]:
import pandas as pd
import numpy as np

# Problem demonstration
def demonstrate_fragmentation(n_rows=1000, n_cols=10):
    """Build a frame the slow way: one column assignment per loop pass.

    This is the deliberately bad pattern that the `solution_*` functions are
    compared against, so the column-by-column loop is kept on purpose.

    Parameters
    ----------
    n_rows : int, default 1000
        Number of rows in the frame.
    n_cols : int, default 10
        Number of random columns (`Col_0` ... `Col_{n_cols-1}`) to add.
        NOTE(review): pandas' "highly fragmented" PerformanceWarning is
        understood to appear only after many more insertions than the
        default (on the order of a hundred) -- raise this to observe it.

    Returns
    -------
    pandas.DataFrame
        Column 'A' holding ``range(n_rows)`` followed by the random columns.
    """
    df = pd.DataFrame({'A': range(n_rows)})

    # Bad practice (intentional): adding columns one by one.
    for i in range(n_cols):
        df[f'Col_{i}'] = np.random.rand(n_rows)

    return df

# Solution 1: Pre-allocate all columns
def solution_preallocate(n_rows=1000, n_cols=10):
    """Build the whole frame in a single constructor call.

    All column data is collected in one dict first, so the DataFrame is
    created once instead of being extended column by column.

    Parameters
    ----------
    n_rows : int, default 1000
        Number of rows in the frame.
    n_cols : int, default 10
        Number of random columns (`Col_0` ... `Col_{n_cols-1}`).

    Returns
    -------
    pandas.DataFrame
        Column 'A' holding ``range(n_rows)`` followed by the random columns.
    """
    # Good practice: gather every column, then construct once.
    data = {'A': range(n_rows)}
    data.update({f'Col_{i}': np.random.rand(n_rows) for i in range(n_cols)})
    return pd.DataFrame(data)

# Solution 2: Use pd.concat when adding multiple columns
def solution_concat(n_rows=1000, n_cols=10):
    """Add many columns to an existing frame with one horizontal concat.

    Parameters
    ----------
    n_rows : int, default 1000
        Number of rows in the frame.
    n_cols : int, default 10
        Number of random columns (`Col_0` ... `Col_{n_cols-1}`).

    Returns
    -------
    pandas.DataFrame
        Column 'A' holding ``range(n_rows)`` followed by the random columns.
    """
    base_df = pd.DataFrame({'A': range(n_rows)})

    # Build all new columns as one frame on the base frame's index, so the
    # column-wise join lines up row for row (also when n_cols is 0).
    new_columns = pd.DataFrame(
        {f'Col_{i}': np.random.rand(n_rows) for i in range(n_cols)},
        index=base_df.index,
    )

    # One horizontal concatenation instead of n_cols separate insertions.
    return pd.concat([base_df, new_columns], axis=1)

# Solution 3: If you must add columns iteratively, use copy() periodically
def solution_copy(n_rows=1000, n_cols=10, defrag_every=5):
    """Add columns iteratively, copying the frame at regular intervals.

    Intended for cases where columns must be added one at a time: the frame
    is replaced by ``df.copy()`` after every `defrag_every` insertions, and
    once more at the end if insertions are left over since the last copy.

    Parameters
    ----------
    n_rows : int, default 1000
        Number of rows in the frame.
    n_cols : int, default 10
        Number of random columns (`Col_0` ... `Col_{n_cols-1}`).
    defrag_every : int, default 5
        Number of insertions between copies; must be at least 1.

    Returns
    -------
    pandas.DataFrame
        Column 'A' holding ``range(n_rows)`` followed by the random columns.

    Raises
    ------
    ValueError
        If `defrag_every` is smaller than 1.
    """
    if defrag_every < 1:
        raise ValueError("defrag_every must be at least 1")

    df = pd.DataFrame({'A': range(n_rows)})

    pending = 0  # insertions since the last copy
    for i in range(n_cols):
        df[f'Col_{i}'] = np.random.rand(n_rows)
        pending += 1

        # Defragment every `defrag_every` insertions.
        if pending == defrag_every:
            df = df.copy()
            pending = 0

    # Cover the tail when n_cols is not a multiple of defrag_every.
    if pending:
        df = df.copy()

    return df

# Example usage and timing comparison
def compare_solutions(solutions=None):
    """Time one call of each frame-building function.

    Parameters
    ----------
    solutions : iterable of callables, optional
        Zero-argument functions to time. Defaults to the four builders
        defined in this notebook, in their original order.

    Returns
    -------
    dict
        Maps each function's ``__name__`` to the elapsed seconds of a single
        call. One run per function is noisy; treat the numbers as a rough
        indication only.
    """
    import time

    if solutions is None:
        solutions = [demonstrate_fragmentation, solution_preallocate,
                     solution_concat, solution_copy]

    times = {}

    for solution in solutions:
        # perf_counter is the clock meant for measuring short intervals;
        # time.time() follows the wall clock and can be coarse or adjusted.
        start = time.perf_counter()
        solution()
        times[solution.__name__] = time.perf_counter() - start

    return times

In [2]:
# Run the column-by-column builder and display the resulting frame.
demonstrate_fragmentation()

Unnamed: 0,A,Col_0,Col_1,Col_2,Col_3,Col_4,Col_5,Col_6,Col_7,Col_8,Col_9
0,0,0.970502,0.957925,0.468427,0.079532,0.789385,0.687192,0.830216,0.456026,0.122476,0.595701
1,1,0.007434,0.037666,0.899013,0.855425,0.836074,0.131365,0.813113,0.630567,0.929076,0.302767
2,2,0.454594,0.104954,0.795225,0.395363,0.808340,0.933567,0.802941,0.102839,0.731712,0.675362
3,3,0.040081,0.814845,0.632997,0.243769,0.665876,0.230559,0.892082,0.266661,0.104501,0.549814
4,4,0.166994,0.414708,0.536812,0.527826,0.561965,0.667315,0.054001,0.969416,0.793741,0.580959
...,...,...,...,...,...,...,...,...,...,...,...
995,995,0.732915,0.200761,0.046573,0.863786,0.338276,0.182731,0.909336,0.862529,0.766403,0.339065
996,996,0.631229,0.356425,0.856700,0.087686,0.643526,0.008038,0.651513,0.972222,0.528491,0.887493
997,997,0.034394,0.516819,0.903808,0.705808,0.251930,0.245246,0.054597,0.758996,0.516045,0.063159
998,998,0.316110,0.651243,0.669695,0.320786,0.111080,0.494766,0.167630,0.586632,0.162270,0.920025
