In [5]:
import pandas as pd
import os
import chardet

# Detect encoding for CSV files
def detect_encoding(file_path, sample_size=10000):
    """Guess the text encoding of a file from its leading bytes.

    Args:
        file_path: Path of the file to inspect.
        sample_size: Number of leading bytes passed to chardet. The default
            (10000, roughly the first 10KB) is usually enough for a guess.

    Returns:
        The value of the ``'encoding'`` entry in chardet's detection result.
        NOTE(review): chardet is expected to report ``None`` when it cannot
        make a guess (e.g. an empty file) -- callers should handle that.
    """
    # Read the sample first so the file is closed before detection runs.
    with open(file_path, 'rb') as f:
        sample = f.read(sample_size)
    return chardet.detect(sample)['encoding']
    

# Find the header row containing expected columns
def find_header_row(filepath, is_excel=False, encoding='utf-8'):
    """Locate the row that holds the expected column headers.

    Only the first 10 rows of the file are inspected. A row qualifies when
    its cells include "first name", "last name" and "company"; the
    comparison ignores letter case and surrounding whitespace.

    Args:
        filepath: Path of the spreadsheet or CSV file to inspect.
        is_excel: Read the file with ``pd.read_excel`` (openpyxl engine)
            when true, otherwise with ``pd.read_csv``.
        encoding: Text encoding used when reading a CSV file; ignored for
            Excel files.

    Returns:
        The zero-based index, within the preview, of the first matching row,
        or 0 when no row matches.
    """
    rows_to_check = 10
    # Lower-case on purpose: every cell is lower-cased before comparison.
    expected = {'first name', 'last name', 'company'}

    if is_excel:
        preview = pd.read_excel(filepath, header=None, nrows=rows_to_check, engine='openpyxl')
    else:
        preview = pd.read_csv(filepath, header=None, nrows=rows_to_check, encoding=encoding)

    # Iterate over the rows actually read: a file may have fewer than
    # rows_to_check rows, and indexing past the end would raise IndexError.
    for i in range(len(preview)):
        cells = preview.iloc[i].astype(str).str.strip().str.lower()
        if expected.issubset(set(cells)):
            return i
    return 0  # fallback if not found


directory = r'LinkedIn Data Public'
files = os.listdir(directory)


def _clean_connections(df):
    """Keep the name/company columns, fill blank companies, drop nameless rows."""
    # .copy() gives an independent frame, so the column assignment below does
    # not write into a slice of the input (pandas' SettingWithCopyWarning).
    cleaned = df[['First Name', 'Last Name', 'Company']].copy()
    cleaned['Company'] = cleaned['Company'].fillna("None")
    return cleaned.dropna(subset=['First Name', 'Last Name'])


def _read_csv_with_fallback(file_path):
    """Read a CSV as utf-8, retrying with ISO-8859-1 if decoding fails."""
    try:
        # Try utf-8 first
        encoding = 'utf-8'
        header_row = find_header_row(file_path, is_excel=False, encoding=encoding)
        return pd.read_csv(file_path, encoding=encoding, header=header_row)
    except UnicodeDecodeError:
        # Fallback to ISO-8859-1 if utf-8 fails
        encoding = 'ISO-8859-1'
        header_row = find_header_row(file_path, is_excel=False, encoding=encoding)
        return pd.read_csv(file_path, encoding=encoding, header=header_row)


# Clean every spreadsheet in the folder. Excel files are converted to a CSV
# next to the original; CSV files are overwritten in place.
for file in files:
    file_path = os.path.join(directory, file)

    try:
        # Process Excel files
        if file.endswith('.xlsx'):
            header_row = find_header_row(file_path, is_excel=True)
            df = pd.read_excel(file_path, engine='openpyxl', header=header_row)

            # Swap only the trailing extension; str.replace would also
            # rewrite any '.xlsx' occurring earlier in the name.
            new_csv_name = file[:-len('.xlsx')] + '.csv'
            output_path = os.path.join(directory, new_csv_name)

        # Process CSV files
        elif file.endswith('.csv'):
            df = _read_csv_with_fallback(file_path)
            output_path = file_path

        else:
            # Not a spreadsheet: just echo the name and move on.
            print(file)
            continue

        # Save cleaned data (reported for Excel and CSV inputs alike)
        _clean_connections(df).to_csv(output_path, index=False)
        print("file processed : ",file)

    except Exception as e:
        # Best-effort batch job: report the failure and continue with the
        # remaining files.
        print(f"Error processing {file}: {e}")


.DS_Store
file processed :  Aaditya_Raj - Aaditya Raj.csv
file processed :  Abhishek_Singh - Abhishek Singh.csv
file processed :  Aditya_Singh - Aditya NO-LASTNAME.csv
file processed :  Afzal_Raza - Afzl Raza.csv
file processed :  Ajay Jatav Connections-1 - Ajay Jatav.csv
file processed :  Ajit_Yadav - Ajit Yadav.csv
file processed :  Akanksha_Kushwaha - Akanksha.csv
file processed :  Alok_raj - Alok Raj.csv
file processed :  Aman_ Adarsh.csv
file processed :  Aman_Singh - Aman Singh.csv
file processed :  amit_kumar - Amit Kumar.csv
file processed :  Anamika_Kumari - Anamika Kumari.csv
file processed :  Anand_Pandey - Anand Pandey.csv
file processed :  Anoop_Kumar - ANOOP KUMAR.csv
file processed :  Anshu_Kumar - Anshu Kumar.csv
file processed :  Anuradha_Tiwari - Anuradha Tiwari.csv
file processed :  Anushri_Mishra - Anushri Mishra.csv
file processed :  Aradhya_Patel - Aradhya Patel.csv
file processed :  Arjun Kadam - Arjun Kadam.csv
file processed :  Arpita_Tripathi - Arpita Tripathi

In [None]:
import os
import hashlib
from collections import defaultdict

# Folder whose CSV files are scanned for duplicates.
data_dir = r'LinkedIn Data Public'
# Maps a file-content hash to the names of the files that produced it.
hash_map = defaultdict(list)

def get_file_hash(filepath, chunk_size=65536):
    """Return the hexadecimal MD5 digest of a file's contents.

    The file is hashed in fixed-size chunks so that large files are never
    loaded into memory in one piece; the digest is the same as hashing the
    whole content at once.

    Args:
        filepath: Path of the file to hash.
        chunk_size: Number of bytes read per iteration.

    Returns:
        The MD5 digest as a hexadecimal string.
    """
    hasher = hashlib.md5()
    with open(filepath, 'rb') as f:
        # iter() with a sentinel stops as soon as read() returns b'' (EOF).
        for chunk in iter(lambda: f.read(chunk_size), b''):
            hasher.update(chunk)
    return hasher.hexdigest()

# Scan all CSVs and group by hash. The listing is sorted so that the copy
# kept from each duplicate group is deterministic (os.listdir returns names
# in arbitrary order).
csv_files = sorted(f for f in os.listdir(data_dir) if f.endswith('.csv'))
for file in csv_files:
    path = os.path.join(data_dir, file)
    file_hash = get_file_hash(path)
    hash_map[file_hash].append(file)

# Delete duplicates, keeping one file per group
deleted_files = []

for file_list in hash_map.values():
    # Keep the first file, delete the rest (an empty slice for unique files)
    for dup_file in file_list[1:]:
        dup_path = os.path.join(data_dir, dup_file)
        os.remove(dup_path)
        deleted_files.append(dup_file)

# Report exactly one outcome: either nothing was removed, or the list of
# removed files.
if not deleted_files:
    print("No duplicate files in folder")
else:
    print(f"🗑️ Deleted {len(deleted_files)} duplicate files:")
    for f in deleted_files:
        print(f"  - {f}")

🗑️ Deleted 2 duplicate files:
  - Sneha_Shaw.csv
  - Samina_Sultana.csv
