In [5]:
import csv

import pandas as pd

# Load the two-column (Token <TAB> Tag) training file as raw text.
#   * dtype=str + keep_default_na=False keep literal tokens such as "NA" or
#     "null" as strings instead of turning them into missing values.
#   * quoting=csv.QUOTE_NONE treats the double-quote character as ordinary
#     data. With the default quoting, a token that starts with '"' opens a
#     quoted field that swallows the following lines until the next quote.
#   * on_bad_lines='skip' drops lines that have more than two fields.
df = pd.read_csv(
    'Train.txt',
    sep='\t',
    names=['Token', 'Tag'],
    dtype=str,
    keep_default_na=False,
    quoting=csv.QUOTE_NONE,
    on_bad_lines='skip',
    encoding='utf-8'
)

# Strip surrounding whitespace so the '*' comparison below is exact.
df['Token'] = df['Token'].str.strip()
df['Tag'] = df['Tag'].str.strip()

# Blank out both columns on every row whose token is '*'. A boolean mask does
# this in one vectorised assignment instead of a Python-level loop over rows.
is_marker_row = df['Token'] == '*'
df_cleaned = df.copy()
df_cleaned.loc[is_marker_row, ['Token', 'Tag']] = ''

# Save to new file. QUOTE_NONE writes tokens verbatim; the default quoting
# would wrap any token containing '"' in quotes and double the quote.
# NOTE(review): a row of two empty strings is written as a single tab, not as
# a zero-length line -- confirm that whatever consumes this file treats a
# tab-only line as blank.
df_cleaned.to_csv(
    'Train_CLeaned.txt',
    sep='\t',
    index=False,
    header=False,
    quoting=csv.QUOTE_NONE,
    encoding='utf-8'
)

print("✅ All '*' replaced with blank lines and saved to 'Train_CLeaned.txt'.")


✅ All '*' replaced with blank lines and saved to 'Train_CLeaned.txt'.


In [5]:
import csv

import pandas as pd

# Remove every '*' marker row from the test file and write the remaining
# Token <TAB> Tag rows to 'cleaned_Test.txt'.
try:
    # Read everything as raw text:
    #   * dtype=str + keep_default_na=False keep literal tokens such as "NA",
    #     "null" or "None" as strings; with the defaults they are parsed as
    #     missing values and written back out as empty fields.
    #   * quoting=csv.QUOTE_NONE treats '"' as ordinary data, so a token that
    #     starts with a double quote cannot swallow the lines that follow it.
    #   * on_bad_lines='skip' drops lines that have more than two fields.
    df = pd.read_csv(
        'Test.txt',
        sep='\t',
        names=['Token', 'Tag'],
        dtype=str,
        keep_default_na=False,
        quoting=csv.QUOTE_NONE,
        on_bad_lines='skip',
        encoding='utf-8'
    )

    # Drop rows whose token is '*'. The comparison ignores surrounding
    # whitespace; the data that gets written is left untouched.
    is_marker_row = df['Token'].str.strip() == '*'
    df_clean = df[~is_marker_row]

    # Save cleaned data. QUOTE_NONE writes tokens verbatim instead of
    # wrapping tokens that contain '"' in quotes.
    df_clean.to_csv(
        'cleaned_Test.txt',
        sep='\t',
        index=False,
        header=False,
        quoting=csv.QUOTE_NONE,
        encoding='utf-8'
    )
    print("Cleaning successful!")

except FileNotFoundError:
    print("Error: 'Test.txt' not found. Check the file path!")
except Exception as e:
    # Any other failure is reported as a message rather than a traceback.
    print(f"Unexpected error: {e}")


Cleaning successful!
