In [3]:
import pandas as pd

# Load the CSV files
df_512 = pd.read_csv("final_test_512.csv")
df_1024 = pd.read_csv("final_test_1024.csv")
df_2048 = pd.read_csv("final_test_2048.csv")

# Merge all three files on 'ID' for comparison
merged_df = df_512.merge(df_1024, on='ID', suffixes=('_512', '_1024'))
merged_df = merged_df.merge(df_2048, on='ID')
merged_df = merged_df.rename(columns={'predicted_label': 'predicted_label_2048'})

# Find rows where predictions differ
mismatched = merged_df[
    (merged_df['predicted_label_512'] != merged_df['predicted_label_1024']) |
    (merged_df['predicted_label_512'] != merged_df['predicted_label_2048']) |
    (merged_df['predicted_label_1024'] != merged_df['predicted_label_2048'])
]

print(mismatched)  # Show mismatched entries

# Save mismatched rows to CSV
mismatched.to_csv("mismatched_predictions.csv", index=False)


        ID  predicted_label_512  predicted_label_1024  predicted_label_2048
10      66                    1                     0                     0
16     110                    0                     1                     0
18     113                    0                     1                     0
26     172                    1                     1                     0
31     205                    1                     0                     0
...    ...                  ...                   ...                   ...
1563  7816                    0                     1                     0
1567  7843                    1                     0                     0
1568  7847                    1                     1                     0
1569  7850                    1                     0                     0
1573  7871                    0                     1                     0

[246 rows x 4 columns]


In [7]:
import pandas as pd

# Load the CSV files
df_512 = pd.read_csv("final_test_512.csv")
df_1024 = pd.read_csv("final_test_1024.csv")
df_2048 = pd.read_csv("final_test_2048.csv")
df_final = pd.read_csv("final_test_predictions.csv")

# Merge all four files on 'ID'
merged_df = df_512.merge(df_1024, on='ID', suffixes=('_512', '_1024'))
merged_df = merged_df.merge(df_2048, on='ID')
merged_df = merged_df.rename(columns={'predicted_label': 'predicted_label_2048'})
merged_df = merged_df.merge(df_final, on='ID')
merged_df = merged_df.rename(columns={'predicted_label': 'predicted_label_final'})

# Find mismatched predictions
mismatched = merged_df[
    (merged_df['predicted_label_512'] != merged_df['predicted_label_1024']) |
    (merged_df['predicted_label_512'] != merged_df['predicted_label_2048']) |
    (merged_df['predicted_label_512'] != merged_df['predicted_label_final']) |
    (merged_df['predicted_label_1024'] != merged_df['predicted_label_2048']) |
    (merged_df['predicted_label_1024'] != merged_df['predicted_label_final']) |
    (merged_df['predicted_label_2048'] != merged_df['predicted_label_final'])
]

# Compute majority label and count
def get_majority_info(row):
    labels = [
        row['predicted_label_512'],
        row['predicted_label_1024'],
        row['predicted_label_2048'],
        row['predicted_label_final']
    ]
    majority_label = max(set(labels), key=labels.count)
    majority_count = labels.count(majority_label)
    return pd.Series([majority_label, majority_count], index=['majority_label', 'majority_count'])

# Apply the function
mismatched[['majority_label', 'majority_count']] = mismatched.apply(get_majority_info, axis=1)

# Save the result
mismatched.to_csv("mismatched_predictions_with_majority_all4.csv", index=False)

print(mismatched)


        ID  predicted_label_512  predicted_label_1024  predicted_label_2048  \
8       58                    1                     1                     1   
10      66                    1                     0                     0   
16     110                    0                     1                     0   
18     113                    0                     1                     0   
24     161                    1                     1                     1   
...    ...                  ...                   ...                   ...   
1563  7816                    0                     1                     0   
1567  7843                    1                     0                     0   
1568  7847                    1                     1                     0   
1569  7850                    1                     0                     0   
1573  7871                    0                     1                     0   

      predicted_label_final  majority_label  majori

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  mismatched[['majority_label', 'majority_count']] = mismatched.apply(get_majority_info, axis=1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  mismatched[['majority_label', 'majority_count']] = mismatched.apply(get_majority_info, axis=1)


In [8]:
import pandas as pd

# Load the two CSV files
df_1024 = pd.read_csv("final_test_1024.csv")
df_2 = pd.read_csv("final_2.csv")

# Merge both on 'ID'
merged_df = df_1024.merge(df_2, on='ID', suffixes=('_1024', '_2'))

# Find mismatched predictions
mismatched = merged_df[
    merged_df['predicted_label_1024'] != merged_df['predicted_label_2']
]

# Show the mismatches
print(mismatched)

# Save to CSV
mismatched.to_csv("mismatched_1024_vs_2.csv", index=False)


        ID  predicted_label_1024  predicted_label_2
18     113                     1                  0
19     117                     0                  1
87     440                     1                  0
113    565                     1                  0
144    734                     1                  0
194    957                     1                  0
206   1015                     1                  0
248   1204                     1                  0
258   1258                     0                  1
260   1264                     1                  0
268   1292                     1                  0
316   1508                     1                  0
322   1535                     0                  1
413   1953                     0                  1
573   2732                     0                  1
594   2828                     0                  1
706   3456                     1                  0
763   3747                     1                  0
782   3815  

In [11]:
import pandas as pd

# Load the two CSV files
df_512 = pd.read_csv("final_test_512.csv")
df_1 = pd.read_csv("final_1.csv")

# Merge both on 'ID'
merged_df = df_512.merge(df_1, on='ID', suffixes=('_512', '_1'))

# Find mismatched predictions
mismatched = merged_df[
    merged_df['predicted_label_512'] != merged_df['predicted_label_1']
]

# Show the mismatches
print(mismatched)

# Save to CSV
mismatched.to_csv("mismatched_512_vs_1.csv", index=False)


        ID  predicted_label_512  predicted_label_1
8       58                    1                  0
24     161                    1                  0
43     234                    1                  0
101    513                    1                  0
121    597                    1                  0
127    648                    0                  1
142    725                    1                  0
163    812                    1                  0
198    976                    0                  1
229   1122                    1                  0
254   1226                    1                  0
260   1264                    1                  0
274   1314                    1                  0
277   1343                    1                  0
287   1385                    1                  0
317   1509                    1                  0
333   1585                    1                  0
338   1615                    1                  0
339   1618                    0

In [12]:
import pandas as pd

# Load the two CSV files
df_2048 = pd.read_csv("final_test_2048.csv")
df_3 = pd.read_csv("final_3.csv")

# Merge both on 'ID'
merged_df = df_2048.merge(df_3, on='ID', suffixes=('_2048', '_3'))

# Find mismatched predictions
mismatched = merged_df[
    merged_df['predicted_label_2048'] != merged_df['predicted_label_3']
]

# Show the mismatches
print(mismatched)

# Save to CSV
mismatched.to_csv("mismatched_2048_vs_3.csv", index=False)


        ID  predicted_label_2048  predicted_label_3
0       13                     0                  1
30     198                     1                  0
65     348                     0                  1
170    864                     1                  0
258   1258                     1                  0
259   1262                     1                  0
300   1449                     0                  1
319   1526                     1                  0
321   1530                     0                  1
344   1637                     1                  0
362   1720                     1                  0
369   1748                     0                  1
379   1781                     0                  1
529   2527                     0                  1
549   2602                     1                  0
573   2732                     1                  0
587   2809                     0                  1
725   3565                     1                  0
730   3630  

In [13]:
import pandas as pd

# Load the CSV files
df1 = pd.read_csv("final_1.csv")
df2 = pd.read_csv("final_2.csv")
df3 = pd.read_csv("final_3.csv")

# Merge all three on 'ID'
merged_df = df1.merge(df2, on='ID', suffixes=('_1', '_2'))
merged_df = merged_df.merge(df3, on='ID')
merged_df = merged_df.rename(columns={'predicted_label': 'predicted_label_3'})

# Identify mismatches
mismatched = merged_df[
    (merged_df['predicted_label_1'] != merged_df['predicted_label_2']) |
    (merged_df['predicted_label_1'] != merged_df['predicted_label_3']) |
    (merged_df['predicted_label_2'] != merged_df['predicted_label_3'])
]

# Function to find majority label and its count
def get_majority_info(row):
    labels = [
        row['predicted_label_1'],
        row['predicted_label_2'],
        row['predicted_label_3']
    ]
    majority_label = max(set(labels), key=labels.count)
    majority_count = labels.count(majority_label)
    return pd.Series([majority_label, majority_count], index=['majority_label', 'majority_count'])

# Apply to mismatched rows
mismatched[['majority_label', 'majority_count']] = mismatched.apply(get_majority_info, axis=1)

# Save mismatches to CSV
mismatched.to_csv("mismatched_final_1_2_3.csv", index=False)

# Optional: Print a sample
print(mismatched.head())


      ID  predicted_label_1  predicted_label_2  predicted_label_3  \
4   6011                  0                  1                  0   
9   1554                  1                  0                  0   
21   306                  0                  1                  0   
22  4732                  0                  1                  0   
24  6128                  1                  1                  0   

    majority_label  majority_count  
4                0               2  
9                0               2  
21               0               2  
22               0               2  
24               1               2  


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  mismatched[['majority_label', 'majority_count']] = mismatched.apply(get_majority_info, axis=1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  mismatched[['majority_label', 'majority_count']] = mismatched.apply(get_majority_info, axis=1)


In [14]:
import pandas as pd

# Load all 6 CSVs
df1 = pd.read_csv("final_1.csv")
df2 = pd.read_csv("final_2.csv")
df3 = pd.read_csv("final_3.csv")
df512 = pd.read_csv("final_test_512.csv")
df1024 = pd.read_csv("final_test_1024.csv")
df2048 = pd.read_csv("final_test_2048.csv")

# Rename predicted_label columns for clarity
df1 = df1.rename(columns={'predicted_label': 'label_1'})
df2 = df2.rename(columns={'predicted_label': 'label_2'})
df3 = df3.rename(columns={'predicted_label': 'label_3'})
df512 = df512.rename(columns={'predicted_label': 'label_512'})
df1024 = df1024.rename(columns={'predicted_label': 'label_1024'})
df2048 = df2048.rename(columns={'predicted_label': 'label_2048'})

# Merge all dataframes on 'ID'
merged_df = df1.merge(df2, on='ID') \
               .merge(df3, on='ID') \
               .merge(df512, on='ID') \
               .merge(df1024, on='ID') \
               .merge(df2048, on='ID')

# Find rows with mismatched predictions
mismatched = merged_df[
    (merged_df.nunique(axis=1) - 1) > 1  # exclude 'ID' column from uniqueness check
]

# Compute majority label and count
def get_majority_info(row):
    labels = [row['label_1'], row['label_2'], row['label_3'], row['label_512'], row['label_1024'], row['label_2048']]
    majority_label = max(set(labels), key=labels.count)
    majority_count = labels.count(majority_label)
    return pd.Series([majority_label, majority_count], index=['majority_label', 'majority_count'])

# Add columns
mismatched[['majority_label', 'majority_count']] = mismatched.apply(get_majority_info, axis=1)

# Save result
mismatched.to_csv("mismatched_predictions_all6.csv", index=False)

print("✅ Comparison complete. Mismatches saved to 'mismatched_predictions_all6.csv'.")


✅ Comparison complete. Mismatches saved to 'mismatched_predictions_all6.csv'.


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  mismatched[['majority_label', 'majority_count']] = mismatched.apply(get_majority_info, axis=1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  mismatched[['majority_label', 'majority_count']] = mismatched.apply(get_majority_info, axis=1)


In [18]:
import pandas as pd

# Load the CSV file
hi = pd.read_csv("mismatched_predictions_all6.csv")

# Get count of each unique majority_count value
count_distribution = hi['majority_count'].value_counts().sort_index()

print(count_distribution)


majority_count
3     24
4    184
5     76
Name: count, dtype: int64


In [19]:
import pandas as pd

# Load the mismatched predictions
df = pd.read_csv("mismatched_predictions_all6.csv")

# Create a new 'label' column based on your condition
df['label'] = df.apply(
    lambda row: row['majority_label'] if row['majority_count'] > 3 else row['label_3'],
    axis=1
)

# Save the updated file
df.to_csv("mismatched_predictions_all6_updated.csv", index=False)

print("✅ Updated 'label' column saved to 'mismatched_predictions_all6_updated.csv'")


✅ Updated 'label' column saved to 'mismatched_predictions_all6_updated.csv'


In [20]:
import pandas as pd

# Load the updated mismatched predictions
df = pd.read_csv("mismatched_predictions_all6_updated.csv")

# Keep only 'ID' and 'label' columns
df_cleaned = df[['ID', 'label']]

# Save the final result
df_cleaned.to_csv("mismatched_predictions_all6_updated.csv", index=False)

print("✅ Saved cleaned file with only 'ID' and 'label' to 'final_labels_only.csv'")


✅ Saved cleaned file with only 'ID' and 'label' to 'final_labels_only.csv'


In [21]:
import pandas as pd

# Load the 6 CSVs
df1 = pd.read_csv("final_1.csv").rename(columns={'predicted_label': 'label_1'})
df2 = pd.read_csv("final_2.csv").rename(columns={'predicted_label': 'label_2'})
df3 = pd.read_csv("final_3.csv").rename(columns={'predicted_label': 'label_3'})
df512 = pd.read_csv("final_test_512.csv").rename(columns={'predicted_label': 'label_512'})
df1024 = pd.read_csv("final_test_1024.csv").rename(columns={'predicted_label': 'label_1024'})
df2048 = pd.read_csv("final_test_2048.csv").rename(columns={'predicted_label': 'label_2048'})

# Merge on ID
merged = df1.merge(df2, on='ID') \
            .merge(df3, on='ID') \
            .merge(df512, on='ID') \
            .merge(df1024, on='ID') \
            .merge(df2048, on='ID')

# Function to compute majority label or use label_3 if count == 3
def get_final_label(row):
    labels = [row['label_1'], row['label_2'], row['label_3'], row['label_512'], row['label_1024'], row['label_2048']]
    majority_label = max(set(labels), key=labels.count)
    count = labels.count(majority_label)
    return majority_label if count > 3 else row['label_3']

# Apply to get final label
merged['label'] = merged.apply(get_final_label, axis=1)

# Keep only ID and final label
final_df = merged[['ID', 'label']]

# Save to CSV
final_df.to_csv("final_final.csv", index=False)

print("✅ Saved final output to 'final_final.csv' with majority voting logic.")


✅ Saved final output to 'final_final.csv' with majority voting logic.


In [26]:
import pandas as pd

# Load final_final
df_final = pd.read_csv("final_final.csv").rename(columns={'label': 'label_final'})

# List of comparison files
comparison_files = ["final_1.csv", "final_2.csv", "final_3.csv"]

for i, file in enumerate(comparison_files, start=1):
    df_compare = pd.read_csv(file).rename(columns={'predicted_label': f'label_file{i}'})
    
    # Merge and compare
    comparison_df = df_final.merge(df_compare, on='ID')
    mismatches = comparison_df[comparison_df['label_final'] != comparison_df[f'label_file{i}']]
    
    # Save mismatches
    mismatches.to_csv(f"mismatched_final_vs_file{i}.csv", index=False)
    
    # Show summary
    print(f"Comparison with final_{i}.csv:")
    print(f"Total mismatches: {mismatches.shape[0]}")
    print(mismatches.head(), "\n")


Comparison with final_1.csv:
Total mismatches: 129
       ID  label_final  label_file1
9    1554            0            1
93     66            0            1
106  7850            0            1
123  5896            0            1
124  1908            0            1 

Comparison with final_2.csv:
Total mismatches: 63
      ID  label_final  label_file2
4   6011            0            1
21   306            0            1
22  4732            0            1
26  2527            1            0
51  7354            0            1 

Comparison with final_3.csv:
Total mismatches: 57
      ID  label_final  label_file3
24  6128            1            0
54  6701            1            0
62  2207            1            0
66    13            0            1
75  3716            1            0 



In [27]:
import pandas as pd

# Load final_final
df_final = pd.read_csv("final_final.csv").rename(columns={'label': 'label_final'})

# List of comparison files
comparison_files = ["final_1.csv", "final_2.csv", "final_3.csv"]

for i, file in enumerate(comparison_files, start=1):
    df_compare = pd.read_csv(file).rename(columns={'predicted_label': f'label_file{i}'})
    
    # Merge and compare
    comparison_df = df_final.merge(df_compare, on='ID')
    mismatches = comparison_df[comparison_df['label_final'] != comparison_df[f'label_file{i}']]
    
    # Save mismatches
    mismatches.to_csv(f"mismatched_final_vs_file{i}.csv", index=False)
    
    # Show summary
    print(f"Comparison with final_{i}.csv:")
    print(f"Total mismatches: {mismatches.shape[0]}")
    print("Mismatched IDs:", mismatches['ID'].unique())  # Print unique mismatched IDs
    print(mismatches.head(), "\n")


Comparison with final_1.csv:
Total mismatches: 129
Mismatched IDs: [1554   66 7850 5896 1908 7160 7801 6196 1250 3896 4504 1614 1196 1703
 1316 2638 1752 2794 6059 5456 1535 1722  304 7350 7701 1363 3209 1778
 7810 4794 1264  597 5138 6494 1637  471 1385  374 6934 1479 1870   58
 7163 1135 7227 7610  648 3768 5768 1449 2504 1109 7275 2602 1068 1387
 1015 1199 6370  523 1349 4638 6354 3204 1476 3536 7843 1696  601 6086
  804 1690 1856 4154 7331 2828 4455 7053 5204 6338  205 6173  780  944
  161 4304 1851 1952 6915 1618 1788  413 2304 3648 6047 7017 1537 7335
 7558 6221  568 7401 5209 6204 6302 7269 3138 1791 1955 4602  506 7721
 1271 1508 5536 5648 4693 6561  565 7221 3456 7042 1956 2693 7270  976
 7343  812  442]
       ID  label_final  label_file1
9    1554            0            1
93     66            0            1
106  7850            0            1
123  5896            0            1
124  1908            0            1 

Comparison with final_2.csv:
Total mismatches: 63
Mismatche

In [10]:
import pandas as pd

# Load the CSV files
test_df = pd.read_csv("test.csv")
final_df = pd.read_csv("final_3.csv", header=None, names=["id", "label"])

# Merge on 'id'
merged_df = test_df.merge(final_df, on="id", how="left")

# Save the updated test.csv with label
merged_df.to_csv("test_with_labels.csv", index=False)


In [11]:
import pandas as pd

# Load the datasets
test_df = pd.read_csv("test_with_labels.csv")
train_df = pd.read_csv("train.csv")

# Strip and normalize text to avoid mismatch due to whitespace
test_df['text'] = test_df['text'].str.strip()
train_df['text'] = train_df['text'].str.strip()

# Find common text
common_texts = pd.merge(test_df, train_df, on='text', how='inner', suffixes=('_test', '_train'))

# Save or display the common samples
common_texts.to_csv("common_text_samples.csv", index=False)
print(common_texts[['id_test', 'text', 'label_test', 'id_train', 'label_train']])


     id_test                                               text  label_test  \
0       3593               Avanga Holi kondada oruku poranga da           0   
1       4619  Bro vazhurathuku vazhi illamal sontha pathanka...           0   
2       4902  @prasanna8990  do some study about state wise ...           0   
3       3637  Avan avan kudumbatha kapatha velaiku varangha ...           0   
4       3742  Tamil nadala velai venumna unga wife uh engalu...           1   
..       ...                                                ...         ...   
642     4846  இந்திகாரனுக்களை தமிழ்பொண்கள்  திருமணம் செய்ய வ...           1   
643     4630  Ommela dai potti avanuga kittala poii kelvi ke...           1   
644     5401  Tamil makkalaukke ungalala padhukappu kudukka ...           1   
645     4270  சேட் என்ற சொல்லுக்கு முதலாளி என்று பொருள்.\nஅத...           1   
646     2526              தமிழ் மொழி எங்களுக்கு முக்கியம் தம்பி           1   

     id_train  label_train  
0        5593         

In [12]:
import pandas as pd

# Load CSVs
test_df = pd.read_csv("test_with_labels.csv")
train_df = pd.read_csv("train.csv")

# Clean whitespace
test_df['text'] = test_df['text'].str.strip()
train_df['text'] = train_df['text'].str.strip()

# Merge on text
merged = pd.merge(test_df, train_df, on='text', suffixes=('_test', '_train'))

# Filter rows with different labels
label_mismatch = merged[merged['label_test'] != merged['label_train']]

# Display the mismatched label samples
print(label_mismatch[['id_test', 'text', 'label_test', 'id_train', 'label_train']])

# Optional: save to CSV
label_mismatch.to_csv("label_mismatch_samples.csv", index=False)


     id_test                                               text  label_test  \
50      2089  தமிழர்களிடம் மட்டுமே வரவு செலவு வைத்துக் கொள்ள...           0   
54      2089  தமிழர்களிடம் மட்டுமே வரவு செலவு வைத்துக் கொள்ள...           0   
303     1781  தமிழரிடம் மட்டுமே வரவு செலவு வைத்துக் கொள்ள வே...           1   
304     1781  தமிழரிடம் மட்டுமே வரவு செலவு வைத்துக் கொள்ள வே...           1   
306     1781  தமிழரிடம் மட்டுமே வரவு செலவு வைத்துக் கொள்ள வே...           1   
307     1781  தமிழரிடம் மட்டுமே வரவு செலவு வைத்துக் கொள்ள வே...           1   
308     1781  தமிழரிடம் மட்டுமே வரவு செலவு வைத்துக் கொள்ள வே...           1   
309     1781  தமிழரிடம் மட்டுமே வரவு செலவு வைத்துக் கொள்ள வே...           1   
311     1781  தமிழரிடம் மட்டுமே வரவு செலவு வைத்துக் கொள்ள வே...           1   
312     1781  தமிழரிடம் மட்டுமே வரவு செலவு வைத்துக் கொள்ள வே...           1   
313     1781  தமிழரிடம் மட்டுமே வரவு செலவு வைத்துக் கொள்ள வே...           1   
314     1781  தமிழரிடம் மட்டுமே வரவு செலவு வைத்துக் 

In [13]:
import pandas as pd

# Load the CSVs
test_df = pd.read_csv("test_with_labels.csv")
train_df = pd.read_csv("train.csv")

# Strip and clean the text columns
test_df['text'] = test_df['text'].str.strip()
train_df['text'] = train_df['text'].str.strip()

# Merge on 'text' to bring in train labels
merged = pd.merge(test_df, train_df[['text', 'label']], on='text', how='left', suffixes=('', '_train'))

# Where there's a mismatch, use train label
merged['label'] = merged.apply(
    lambda row: row['label_train'] if pd.notna(row['label_train']) and row['label'] != row['label_train'] else row['label'],
    axis=1
)

# Drop the extra train label column
merged = merged.drop(columns=['label_train'])

# Save to new CSV
merged.to_csv("test_updated_labels.csv", index=False)
