## NAME : Mohamed Mousa

## Language Detection Notebook

### importing libraries 

In [1]:
import pandas as pd
import numpy as np
import pickle
from nltk.stem import WordNetLemmatizer
stemmer = WordNetLemmatizer()
import pickle
from nltk.corpus import stopwords
import re
import nltk
from sklearn.datasets import load_files
nltk.download('stopwords')
import warnings
warnings.simplefilter("ignore")



from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\LENOVO\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


### load data

In [2]:
data = pd.read_csv("Language_det_train.csv")

### Cleaning Text from any (special characters ,the symbols , numbers , ...........)

In [3]:
stemmer = WordNetLemmatizer()

def clean_txt(x):
    # Remove all the special characters
    document = re.sub(r'\W', ' ', str(x))
    # removing the symbols and numbers
    document = re.sub(r'[!@#$(),n"%^*?:;~`0-9]', ' ', document)
    document = re.sub(r'[[]]', ' ', document)
    # Substituting multiple spaces with single space
    document = re.sub(r'\s+', ' ', document, flags=re.I)
    # Removing prefixed 'b'
    document = re.sub(r'^b\s+', '', document)
    # Converting to Lowercase
    document = document.lower()
    return document

In [4]:
## loop on text data to clean by apply & lambda Func
data['Text'] = data['Text'].apply(lambda x:clean_txt(x))

### spliting data to Input & target

In [6]:
X = data['Text']
y = data['Language']

### spliting data to train & test

In [7]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=72 , stratify=y)

In [8]:
### Label encoding for Y target to  transform categorical labels into numerical values 
### By assigning a unique numerical identifier to each category

label_encoder = LabelEncoder()
y_train_encoded = label_encoder.fit_transform(y_train) 
y_test_encoded = label_encoder.transform(y_test)

In [10]:
# Utilize TF-IDF vectorization to convert text data into numerical representations.
# This technique captures the importance of words in distinguishing between different languages
###    , enhancing the model's language detection capabilities.


# Initialize TF-IDF vectorizer with character-level analysis and n-gram range of 1 to 3.
tfidfvectorizer = TfidfVectorizer( analyzer='char' , ngram_range=(1,3) )

In [25]:
# Create a pipeline to streamline the text processing and classification workflow.
# The 'TF-idf' step uses the TF-IDF vectorizer initialized earlier to convert text data into numerical representations.
# The 'LR' step utilizes Logistic Regression as the classification algorithm to predict the language of the input text.


pipeline = Pipeline([
    ('TF-idf', tfidfvectorizer),  
    ('LR', LogisticRegression())    
])

In [27]:
# Fit the pipeline to the training data, X_train and y_train_encoded.
# This step trains the TF-IDF vectorizer on the training text data and then fits the Logistic Regression classifier to learn the language patterns.
# Once trained, the pipeline is capable of transforming and classifying new text data.
# Predict the language labels for the test data, X_test, using the trained pipeline.
# The predicted labels are stored in y_pred for further evaluation of the model's performance.


pipeline.fit(X_train, y_train_encoded)
y_pred = pipeline.predict(X_test)

- Calculate precision, recall, and accuracy to evaluate the language detection model.
- Precision measures the accuracy of positive predictions, while recall assesses the model's ability to capture all instances of each language.
- These metrics are valuable even in balanced datasets, providing insights into the model's performance across all languages.
- Accuracy provides an overall measure of correctness, complementing precision and recall in evaluating model performance.


In [29]:
from sklearn.metrics import accuracy_score, classification_report

accuracy = accuracy_score(y_test_encoded, y_pred)
print("Accuracy:", accuracy)

# Classification report
print(classification_report(y_test_encoded, y_pred))

Accuracy: 0.9831975560081466
              precision    recall  f1-score   support

           0       1.00      0.99      1.00       102
           1       0.95      0.93      0.94        81
           2       0.95      0.99      0.97       104
           3       0.98      0.99      0.98       263
           4       0.98      0.99      0.99       193
           5       0.99      0.96      0.97        89
           6       1.00      1.00      1.00        69
           7       1.00      1.00      1.00        12
           8       0.99      0.98      0.98       133
           9       1.00      1.00      1.00        70
          10       1.00      1.00      1.00       113
          11       0.98      0.96      0.97       140
          12       1.00      0.99      1.00       132
          13       0.97      0.98      0.97       156
          14       0.98      0.96      0.97       128
          15       1.00      1.00      1.00        89
          16       0.99      1.00      0.99        9

In [30]:
pipeline

In [76]:
import joblib
# Save the pipeline to a file
joblib.dump(pipeline, 'language_detection_pipeline.pkl')
joblib.dump(label_encoder, 'label_encoder.pkl')

['label_encoder.pkl']

In [35]:
loaded_pipeline = joblib.load('language_detection_pipeline.pkl')

# Use the loaded pipeline for predictions
loaded_predictions = loaded_pipeline.predict(X_test)

### Testing Model in a Real Data

- language detection model can classify more than minimum languages

In [73]:
import time

TextData = 'it was disco ti ued i february'
arabic_Text = ' عزا فريق في مركز بالو ألتو للأبحاث هذا التباطؤ في النمو إلى التفرد المتزايد للمشروع ومقاومته للتغيير '
portugeese_text =  'a wikipédia recebe e tre e pedidos de pági a por segu do depe de do da hora do dia ' 


# Clean text input
cleaned_text = clean_txt(portugeese_text)
start_time = time.time()
# Make prediction using the loaded pipeline
predicted_language_encoded = pipeline.predict([cleaned_text])[0]
end_time = time.time()
# Inverse transform the predicted label
predicted_language = label_encoder.inverse_transform([predicted_language_encoded])

# Calculate the time taken for prediction
prediction_time = end_time - start_time
print("Time taken for prediction:", prediction_time, "seconds")
print(predicted_language)

Time taken for prediction: 0.026526689529418945 seconds
['Portugeese']


### Building app.py file for creating Apis

In [118]:
%%writefile app.py
import uvicorn
import joblib
from fastapi import FastAPI ,HTTPException
from pydantic import BaseModel
import joblib
import re
from transformers import AutoTokenizer, DataCollatorForSeq2Seq, AdamWeightDecay, TFAutoModelForSeq2SeqLM

pipeline = joblib.load('language_detection_pipeline.pkl')
label_encoder = joblib.load('label_encoder.pkl')

# Load the pre-trained model and tokenizer

model_eng2ar = TFAutoModelForSeq2SeqLM.from_pretrained("D:\Kemet NLP\en2ar_model")
tokenizer_eng2ar = AutoTokenizer.from_pretrained("D:\Kemet NLP\en2ar_tok")
 
model_ar2eng = TFAutoModelForSeq2SeqLM.from_pretrained("D:\Kemet NLP\ar2eng_model")
tokenizer_ar2eng =AutoTokenizer.from_pretrained("D:\Kemet NLP\ar2eng_model_tok")




def predict_pipeline(text):
    # Remove all the special characters
    text = re.sub(r'\W', ' ', str(text))
    # removing the symbols and numbers
    text = re.sub(r'[!@#$(),n"%^*?:;~`0-9]', ' ', text)
    text = re.sub(r'\[\]', ' ', text)
    # Substituting multiple spaces with single space
    text = re.sub(r'\s+', ' ', text, flags=re.I)
    # Removing prefixed 'b'
    text = re.sub(r'^b\s+', '', text)
    # Converting to Lowercase
    text = text.lower()
    
    predicted_language_encoded = pipeline.predict([text])[0]
    
    return predicted_language_encoded

def translate_eng2ar(clean_text):
    input_text =  clean_text
    inputs = tokenizer_eng2ar(input_text, return_tensors="pt").input_ids
    outputs = model_eng2ar.generate(inputs, max_length=64)
    translated_text = tokenizer_eng2ar.decode(outputs[0], skip_special_tokens=True)

    return translated_text

def translate_ar2eng(clean_text):
    input_text =  clean_text
    inputs = tokenizer_ar2eng(input_text, return_tensors="pt").input_ids
    outputs = model_ar2eng.generate(inputs, max_length=64)
    translated_text = tokenizer_ar2eng.decode(outputs[0], skip_special_tokens=True)

    return translated_text





class TextIn(BaseModel):
    TextIn: str

class PredictionOut(BaseModel):
    language: str
        
class Translation(BaseModel):
    language: str        
        

        
        
app = FastAPI()




@app.get("/")
def home():
    return {"health_check": "OK"}


@app.post("/predict", response_model=PredictionOut)
def predict_language(payload: TextIn):
    try:
        if not payload.TextIn.strip():
            raise HTTPException(status_code=400, detail="Empty text provided")
        
        predicted_language_encoded = predict_pipeline(payload.TextIn)
        predicted_language = label_encoder.inverse_transform([predicted_language_encoded])[0]
        predicted_language_str = str(predicted_language)  # Convert to string if necessary
        
        return {"language": predicted_language_str}
    
    except Exception as e:
        # Log the error
        print(f"An error occurred: {str(e)}")
        # Return an error response
        raise HTTPException(status_code=500, detail="Internal Server Error")

        
@app.post("/translation/" , response_model=Translation)
async def translate_text(text_data: TextIn):
    try:
        clean_text = text_data.TextIn.strip()
        if not clean_text:
            raise HTTPException(status_code=400, detail="Empty text provided")
        
        predicted_language_encoded = predict_pipeline(clean_text)
        predicted_language = label_encoder.inverse_transform([predicted_language_encoded])[0]
        predicted_language_str = str(predicted_language)
        
        if predicted_language_str == 'English':
            translated_text = translate_eng2ar(clean_text)
        elif predicted_language_str == 'Arabic':
            translated_text = translate_ar2eng(clean_text)
        else:
            raise HTTPException(status_code=400, detail="Unsupported language. Only Arabic and English are supported.")
        
        return {"translation": translated_text}  # Return only the translated text
        
    except Exception as e:
        print(f"An error occurred: {str(e)}")
        raise HTTPException(status_code=500, detail="Internal Server Error")

if __name__ == "__main__":
    uvicorn.run(app, host="127.0.0.1", port=8000)

Overwriting app.py
