### Import Necessary Libraries


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from wordcloud import WordCloud
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import make_pipeline
import re
import joblib
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report, confusion_matrix

# Fetch the NLTK 'stopwords' corpus; stopwords.words('english') is used
# later when the text is cleaned.
nltk.download('stopwords')

### Load Dataset


In [None]:
# Load the dataset from a CSV file given by a relative path. The rest of the
# notebook reads its 'Text' and 'Language' columns.
df = pd.read_csv('Language Detection.csv')

### Display Head


In [None]:
# Preview the first few rows of the dataset.
df.head()

### Data Info


In [None]:
# Print a structural summary of the DataFrame (columns, non-null counts, dtypes).
df.info()

### Data Summary


In [None]:
# Show summary statistics for the dataset's columns.
df.describe()

### Languages Count


In [None]:
# Count how many rows carry each 'Language' value and print the tally.
language_counts = df['Language'].value_counts()
print(language_counts)

### Bar Chart for Language Distribution


In [None]:
# Bar chart of the number of rows per language, drawn on an explicit Axes.
language_counts = df['Language'].value_counts()
fig, ax = plt.subplots(figsize=(10, 6))
language_counts.plot.bar(color='skyblue', ax=ax)
ax.set_title('Language Distribution')
ax.set_xlabel('Language')
ax.set_ylabel('Count')
plt.show()

### Word Clouds for Each Language


In [None]:
# Draw one word cloud per distinct language, built from all of that
# language's texts joined with single spaces.
for language in df['Language'].unique():
    language_texts = df.loc[df['Language'] == language, 'Text']
    cloud = WordCloud(width=800, height=400, background_color='white')
    cloud.generate(' '.join(language_texts))

    fig, ax = plt.subplots(figsize=(10, 6))
    ax.imshow(cloud, interpolation='bilinear')
    ax.set_title(f'Word Cloud for {language}')
    ax.axis('off')
    plt.show()

### Histogram of Text Length


In [None]:
# Store the character count of every text in a new 'Text Length' column,
# then plot how those lengths are distributed.
df['Text Length'] = df['Text'].map(len)

fig, ax = plt.subplots(figsize=(10, 6))
ax.hist(df['Text Length'], bins=50, color='lightcoral', edgecolor='black')
ax.set_title('Histogram of Text Length')
ax.set_xlabel('Text Length')
ax.set_ylabel('Frequency')
plt.show()

### Box Plot of Text Length by Language


In [None]:
# One box per language showing the spread of the 'Text Length' column.
fig, ax = plt.subplots(figsize=(12, 8))
sns.boxplot(data=df, x='Language', y='Text Length', palette='viridis', ax=ax)
ax.set_title('Box Plot of Text Length by Language')
ax.set_xlabel('Language')
ax.set_ylabel('Text Length')
plt.show()

### Pie Chart for Language Proportions


In [None]:
# Pie chart of each language's share of the rows, labelled with percentages.
pie_colors = ['lightblue', 'lightgreen', 'lightcoral']
fig, ax = plt.subplots(figsize=(8, 8))
df['Language'].value_counts().plot.pie(
    autopct='%1.1f%%', colors=pie_colors, ax=ax)
ax.set_title('Language Proportions')
plt.show()

### Scatter Plot for Text Length vs. Language


In [None]:
# Scatter of text length (x) against language (y); the points are
# semi-transparent so overlapping points remain visible.
fig, ax = plt.subplots(figsize=(12, 8))
ax.scatter(df['Text Length'], df['Language'], alpha=0.5, color='darkblue')
ax.set_title('Scatter Plot of Text Length vs. Language')
ax.set_xlabel('Text Length')
ax.set_ylabel('Language')
plt.show()

### Filter Data on Languages


In [None]:
# Keep only the rows whose language is one of the two being classified.
selected_languages = ['English', 'French']
is_selected = df['Language'].isin(selected_languages)
filtered_df = df[is_selected]
filtered_df

### Removing Stopwords and Stemming


In [None]:
# Build the cleaned corpus, one string per row of filtered_df:
#   1. replace every character that is not an ASCII letter with a space,
#   2. lowercase and split on whitespace,
#   3. drop English stopwords and Porter-stem the remaining tokens,
#   4. re-join the stems with single spaces.
ps = PorterStemmer()

# Load the stopword list once into a set. Calling stopwords.words('english')
# for every token re-evaluates it each time and scans a list per lookup.
english_stopwords = set(stopwords.words('english'))

# NOTE(review): this pattern also blanks accented letters (e.g. 'é'), which
# splits French words apart; confirm this is intended for the French rows.
non_letters = re.compile('[^a-zA-Z]')

corpus = []
for text in filtered_df['Text']:
    tokens = non_letters.sub(' ', text).lower().split()
    stems = [ps.stem(token)
             for token in tokens if token not in english_stopwords]
    corpus.append(' '.join(stems))

In [None]:
# print(corpus)

### Convert Sentences into Vector


In [None]:
# Turn the cleaned corpus into a dense bag-of-words count matrix, keeping at
# most 10000 vocabulary terms.
cv = CountVectorizer(max_features=10000)
term_counts = cv.fit_transform(corpus)
X = term_counts.toarray()

In [None]:
# Dimensions of the count matrix produced by the vectorizer.
X.shape

### Label Encoding


In [None]:
# Learn the language -> integer mapping from the filtered rows, then encode
# those same rows with it.
label_encoder = LabelEncoder()
label_encoder.fit(filtered_df['Language'])
y = label_encoder.transform(filtered_df['Language'])
y

In [None]:
# Number of encoded labels (one per row of filtered_df).
len(y)

In [None]:
# Original language names known to the encoder; position i is the name for
# encoded label i.
label_encoder.classes_

### Final Data


In [None]:
# Pair every cleaned sentence with its encoded label. Building the frame from
# a dict keeps 'Language' as integers; np.c_[corpus, y] put strings and
# integers into one array, which silently turned the labels into strings.
final_df = pd.DataFrame({'Text': corpus, 'Language': y})

In [None]:
# Display the cleaned text alongside its encoded language label.
final_df

### Split Training and Testing Data


In [None]:
# Assuming final_df contains your data
# Features are the cleaned sentences; targets are the encoded languages.
X, y = final_df['Text'], final_df['Language']

# Hold out 20% of the rows for testing; the fixed seed makes the split
# reproducible.
split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts

# Report how many samples ended up in each partition.
print("Training set shape:", X_train.shape, y_train.shape)
print("Testing set shape:", X_test.shape, y_test.shape)

### Define Model


In [None]:
# Create a pipeline with CountVectorizer and MultinomialNB
model = make_pipeline(CountVectorizer(), MultinomialNB())

# Fit the model on the training data
model.fit(X_train, y_train)

### Test Model on Test Data


In [None]:
# Predict the encoded language of every held-out sentence and show the result.
y_pred = model.predict(X_test)
y_pred

In [None]:
# True encoded labels of the held-out sentences, for comparison with y_pred.
y_test

### Evaluate the Accuracy of the Model


In [None]:
# Evaluate the accuracy of the model on the testing data
accuracy = accuracy_score(y_test, y_pred, normalize=True)
precision = precision_score(y_test, y_pred, average='weighted')
recall = recall_score(y_test, y_pred, average='weighted')
f1 = f1_score(y_test, y_pred, average='weighted')

print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1-Score:", f1)

### Classification Report


In [None]:
# Per-class precision/recall/F1 table, with rows labelled by the original
# language names instead of the encoded values.
language_names = label_encoder.classes_
class_report = classification_report(
    y_test, y_pred, target_names=language_names)
print("Classification Report:\n", class_report)

### Confusion Matrix


In [None]:
# Compute the confusion matrix
conf_matrix = confusion_matrix(y_test, y_pred)

# Display the confusion matrix using seaborn heatmap
plt.figure(figsize=(20, 20))
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap=plt.cm.Accent,
            xticklabels=label_encoder.classes_, yticklabels=label_encoder.classes_)

plt.xlabel('Predicted Labels')
plt.ylabel('True Labels')
plt.title('Confusion Matrix')
plt.show()

### Actual and Predicted


In [None]:
# Create a DataFrame with actual and predicted labels
results_df = pd.DataFrame(
    {'Actual Language': y_test, 'Predicted Language': y_pred})

# Display the DataFrame
results_df

### Save Model


In [None]:
# # Save the model to a file
# model_filename = 'eng_vs_french_classify.sav'
# joblib.dump((model, cv, label_encoder), model_filename)

### Load the Model


In [None]:
# # Load the model from the file
# loaded_model, loaded_vectorizer, loaded_label_encoder = joblib.load(
#     model_filename)

### Function to Test the Model


### Test the Function
