In [7]:
import pandas as pd

# Read both batch files
batch1 = pd.read_csv('batch_0_128.csv')
batch2 = pd.read_csv('batch_128_2944.csv')

# Replace first 128 rows in batch2 with batch1
batch2.iloc[0:128] = batch1.iloc[0:128]
batch = batch2

print("Successfully merged batches by replacing first 128 rows")


Successfully merged batches by replacing first 128 rows


In [10]:
import pandas as pd

df = batch
main = pd.read_csv('../input.csv')

# Print info about both DataFrames
print("Main DataFrame:")
print(main.columns.tolist())
print(f"Main shape: {main.shape}")

print("\nBatch DataFrame:")
print(df.columns.tolist())
print(f"Batch shape: {df.shape}")

# Check for any differences in column names (case sensitivity, spaces, etc)
print("\nComparing 'company_name' values:")
print(f"Main unique values: {main['company_name'].nunique()}")
print(f"Batch unique values: {df['company_name'].nunique()}")

# Clean and merge
def clean_merge():
    # Clean the columns (remove spaces, lowercase)
    for frame in [main, df]:
        frame.columns = frame.columns.str.strip().str.lower()
        
        # Clean the matching columns if needed
        frame['company_name'] = frame['company_name'].str.strip()
        frame['short_description'] = frame['short_description'].str.strip()
    
    # Merge with verbose output
    merged_df = pd.merge(
        main, df,
        on=['company_name', 'short_description'],
        how='left',
        suffixes=('', '_y'),
        indicator=True  # This will show which rows matched
    )
    
    # Show merge results
    print("\nMerge Results:")
    print(merged_df['_merge'].value_counts())
    
    # Drop duplicate columns
    columns_to_drop = [col for col in merged_df.columns if col.endswith('_y') or col == '_merge']
    merged_df = merged_df.drop(columns=columns_to_drop)
    
    # Add new columns from df
    df_unique_cols = [col for col in df.columns if col not in main.columns]
    final_df = merged_df[main.columns.tolist() + df_unique_cols]
    
    return final_df

# Execute the merge
main = clean_merge()

# Verify results
print("\nFinal DataFrame:")
print(f"Shape: {main.shape}")
print("Columns:", main.columns.tolist())

# Save the result
# main.to_csv('merged_output.csv', index=False)

Main DataFrame:
['Unnamed: 0', 'company_id', 'company_name', 'short_description', 'long_description', 'batch', 'status', 'tags', 'location', 'country', 'year_founded', 'num_founders', 'founders_names', 'team_size', 'website', 'cb_url', 'linkedin_url']
Main shape: (5024, 17)

Batch DataFrame:
['unnamed: 0', 'company_name', 'short_description', 'website', 'direct_mentions_count', 'direct_mentions', 'all_search_results']
Batch shape: (5024, 7)

Comparing 'company_name' values:
Main unique values: 4959
Batch unique values: 4959

Merge Results:
_merge
both          5024
left_only        0
right_only       0
Name: count, dtype: int64

Final DataFrame:
Shape: (5024, 20)
Columns: ['unnamed: 0', 'company_id', 'company_name', 'short_description', 'long_description', 'batch', 'status', 'tags', 'location', 'country', 'year_founded', 'num_founders', 'founders_names', 'team_size', 'website', 'cb_url', 'linkedin_url', 'direct_mentions_count', 'direct_mentions', 'all_search_results']


In [11]:
main.head()

Unnamed: 0,unnamed: 0,company_id,company_name,short_description,long_description,batch,status,tags,location,country,year_founded,num_founders,founders_names,team_size,website,cb_url,linkedin_url,direct_mentions_count,direct_mentions,all_search_results
0,0,30204,Subtrace,Chrome DevTools for your backend,We make it easy to watch everything happening ...,W25,Active,"['developer-tools', 'infrastructure']",,,2024.0,2,"['Adhityaa Chandrasekar', 'Sachin Sridhar']",2.0,https://subtrace.dev,,https://www.linkedin.com/company/subtrace,0.0,[],[SearchResult(url=https://www.reddit.com/r/gam...
1,1,30208,Hyphen,Medical Data Annotation Software,Hyphen enables Medical AI research teams and d...,W25,Active,"['aiops', 'machine-learning', 'crowdsourcing',...",San Francisco,US,2024.0,2,"['Vishruth N', 'Nikhil Tiwari']",2.0,https://www.hyphenlabs.tech,,,0.0,[],[]
2,2,30114,Ryvn,"Deploy, update, and monitor applications in cu...",Ryvn manages the infrastructure for your appli...,F24,Active,"['aiops', 'developer-tools', 'saas', 'devops',...",San Francisco,US,2024.0,2,"['Albert Lam', 'Shardool Patel']",2.0,https://ryvn.ai,,https://www.linkedin.com/company/ryvn/,0.0,[],[SearchResult(url=https://www.reddit.com/r/NBA...
3,3,30099,Gander,"LLMs For Aviation, Starting with Customer Service",Gander is bringing LLMs to the airline industr...,F24,Active,"['artificial-intelligence', 'saas', 'customer-...",New York,US,2024.0,2,"['Arjan Guglani', 'Andrew Dixon']",3.0,https://usegander.com,,https://www.linkedin.com/company/usegander,0.0,[],[]
4,4,30097,Innate,Teachable general-purpose robots,Innate is developing teachable home robots.\r\...,F24,Active,"['generative-ai', 'hard-tech', 'robotics', 'co...",Palo Alto,US,2024.0,2,"['Axel Peytavin', 'Vignesh Anand']",2.0,https://innate.bot,,https://www.linkedin.com/company/innate-bot,0.0,[],[SearchResult(url=https://www.reddit.com/r/dis...


In [13]:
# Check unique values and their frequencies in the 'batch' column. Taking only the most recent batches until 2019
print("\nUnique values in 'batch' column:")
print(main[main['batch'].isin(['W25', 'F24', 'S24', 'W24', 'S23', 'W23', 'S22', 'W22', 'S21', 'W21', 'S20', 'W20', 'S19', 'W19'])]['batch'].value_counts())



Unique values in 'batch' column:
batch
W22    401
S21    392
W21    337
W23    278
S24    257
W24    251
S22    236
W20    228
S23    221
S20    208
W19    195
S19    175
F24     95
W25      2
Name: count, dtype: int64


In [14]:
# Filter for rows with batches from W25 to W19
recent_batches = ['W25', 'F24', 'S24', 'W24', 'S23', 'W23', 'S22', 'W22', 'S21', 'W21', 'S20', 'W20', 'S19', 'W19']
filtered_df = main[main['batch'].isin(recent_batches)]

print("\nFiltered DataFrame Shape:", filtered_df.shape)
filtered_df.head()



Filtered DataFrame Shape: (3276, 20)


Unnamed: 0,unnamed: 0,company_id,company_name,short_description,long_description,batch,status,tags,location,country,year_founded,num_founders,founders_names,team_size,website,cb_url,linkedin_url,direct_mentions_count,direct_mentions,all_search_results
0,0,30204,Subtrace,Chrome DevTools for your backend,We make it easy to watch everything happening ...,W25,Active,"['developer-tools', 'infrastructure']",,,2024.0,2,"['Adhityaa Chandrasekar', 'Sachin Sridhar']",2.0,https://subtrace.dev,,https://www.linkedin.com/company/subtrace,0.0,[],[SearchResult(url=https://www.reddit.com/r/gam...
1,1,30208,Hyphen,Medical Data Annotation Software,Hyphen enables Medical AI research teams and d...,W25,Active,"['aiops', 'machine-learning', 'crowdsourcing',...",San Francisco,US,2024.0,2,"['Vishruth N', 'Nikhil Tiwari']",2.0,https://www.hyphenlabs.tech,,,0.0,[],[]
2,2,30114,Ryvn,"Deploy, update, and monitor applications in cu...",Ryvn manages the infrastructure for your appli...,F24,Active,"['aiops', 'developer-tools', 'saas', 'devops',...",San Francisco,US,2024.0,2,"['Albert Lam', 'Shardool Patel']",2.0,https://ryvn.ai,,https://www.linkedin.com/company/ryvn/,0.0,[],[SearchResult(url=https://www.reddit.com/r/NBA...
3,3,30099,Gander,"LLMs For Aviation, Starting with Customer Service",Gander is bringing LLMs to the airline industr...,F24,Active,"['artificial-intelligence', 'saas', 'customer-...",New York,US,2024.0,2,"['Arjan Guglani', 'Andrew Dixon']",3.0,https://usegander.com,,https://www.linkedin.com/company/usegander,0.0,[],[]
4,4,30097,Innate,Teachable general-purpose robots,Innate is developing teachable home robots.\r\...,F24,Active,"['generative-ai', 'hard-tech', 'robotics', 'co...",Palo Alto,US,2024.0,2,"['Axel Peytavin', 'Vignesh Anand']",2.0,https://innate.bot,,https://www.linkedin.com/company/innate-bot,0.0,[],[SearchResult(url=https://www.reddit.com/r/dis...


In [18]:
# Sort filtered_df by direct_mentions_count in descending order
sorted_df = filtered_df.sort_values('direct_mentions_count', ascending=False)

print("\nCompanies sorted by direct mentions (highest to lowest):")
print(sorted_df[['company_name', 'direct_mentions_count', 'batch']].head(10))

sorted_df.to_csv('sorted_most_mentions.csv')



Companies sorted by direct mentions (highest to lowest):
     company_name  direct_mentions_count batch
648   Dioxus Labs                   50.0   S23
946          iNRI                   50.0   W23
2406        Cyble                   50.0   W21
1331  NewsCatcher                   50.0   S22
670          Pure                   50.0   S23
1329        Phind                   50.0   S22
1324      Payload                   50.0   S22
2800    Jet Admin                   50.0   W20
2797      Whatnot                   50.0   W20
1824       Algofi                   50.0   S21
