<a href="https://colab.research.google.com/github/Hiten-0710/Identify-fake-job-postings/blob/main/Untitled16.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
import io

# Function to preprocess the data
def preprocess_data(data):
    """Parse CSV text into a fully numeric frame ready for model training.

    Args:
        data: CSV text containing the columns ``title_length``,
            ``description_length``, ``has_company_profile`` and ``is_fake``.
            ``is_fake`` holds ``yes``/``no`` in any letter case, optionally
            padded with whitespace.

    Returns:
        A DataFrame in which every remaining row has valid values in all
        columns: ``has_company_profile`` and ``is_fake`` are integers (1/0)
        and the two length columns are numeric.  Rows with missing,
        non-numeric or unrecognised values are dropped.
    """
    # Convert CSV string to DataFrame
    df = pd.read_csv(io.StringIO(data))

    # Drop rows with missing values.  The explicit copy guarantees that the
    # column assignments below operate on an independent frame.
    df = df.dropna().copy()

    # Coerce every numeric column the same way: anything that is not a number
    # becomes NaN and is removed below, instead of one column raising while
    # the others silently drop.
    for column in ('title_length', 'description_length', 'has_company_profile'):
        df[column] = pd.to_numeric(df[column], errors='coerce')

    # Convert is_fake to numeric (1 for 'yes', 0 for 'no').  Stripping first
    # keeps labels such as " No " from being treated as invalid.
    df['is_fake'] = (
        df['is_fake'].str.strip().str.lower().map({'yes': 1, 'no': 0})
    )

    # Drop any rows with invalid numeric values or unrecognised labels
    df = df.dropna()

    # NaN handling above may have promoted the flag columns to float; hand
    # back real integers so the class labels are 0/1 rather than 0.0/1.0.
    return df.astype({'has_company_profile': int, 'is_fake': int})

# Function to train the model
def train_model(df):
    """Fit a random forest on the posting features and report hold-out accuracy.

    Args:
        df: Frame providing the feature columns ``title_length``,
            ``description_length`` and ``has_company_profile`` plus the
            target column ``is_fake``.

    Returns:
        The fitted ``RandomForestClassifier``.  The accuracy measured on the
        held-out 20% of the rows is printed, not returned.
    """
    feature_columns = ['title_length', 'description_length', 'has_company_profile']
    features = df[feature_columns]
    labels = df['is_fake']

    # Fixed seed so the 80/20 split is reproducible between runs
    features_train, features_test, labels_train, labels_test = train_test_split(
        features, labels, test_size=0.2, random_state=42
    )

    # 100 trees, seeded for reproducible fitting
    forest = RandomForestClassifier(n_estimators=100, random_state=42)
    forest.fit(features_train, labels_train)

    # Score the held-out rows
    predicted_labels = forest.predict(features_test)
    accuracy = accuracy_score(labels_test, predicted_labels)
    print(f"Model Accuracy: {accuracy:.2f}")

    return forest

# Function to get user input and predict
def predict_job_posting(model):
    """Interactively collect one posting's features and print the verdict.

    Prompts on stdin for the title length, the description length and whether
    the posting has a company profile, re-asking until each answer is valid,
    then prints the predicted label and the probability of the "fake" class.

    Args:
        model: A fitted classifier exposing ``predict`` and ``predict_proba``
            for the features (title_length, description_length,
            has_company_profile), where class ``1`` means "fake".

    Returns:
        None.  The result is written to stdout.
    """
    print("\nEnter job posting details:")

    title_length = _prompt_non_negative_int(
        "Title length (number of characters): ", "Title length"
    )
    description_length = _prompt_non_negative_int(
        "Description length (number of characters): ", "Description length"
    )
    has_profile = _prompt_yes_no(
        "Does the posting have a company profile? (yes/no): "
    )

    # A model fitted on a DataFrame records its column names; predicting on a
    # bare nested list then makes scikit-learn warn about missing feature
    # names.  Rebuild a one-row frame with the same names when the model has
    # them, and fall back to the plain list otherwise.
    row = [[title_length, description_length, has_profile]]
    feature_names = getattr(model, "feature_names_in_", None)
    if feature_names is None:
        features = row
    else:
        features = pd.DataFrame(row, columns=list(feature_names))

    # Make prediction
    prediction = model.predict(features)[0]

    # Look up the column of class 1 instead of assuming it is column 1: a
    # model that saw only one class during training has a single column.
    classes = list(getattr(model, "classes_", (0, 1)))
    class_probabilities = model.predict_proba(features)[0]
    probability = class_probabilities[classes.index(1)] if 1 in classes else 0.0

    # Display result
    result = "Fake" if prediction == 1 else "Real"
    print(f"\nPrediction: This job posting is {result}")
    print(f"Probability of being fake: {probability:.2%}")


def _prompt_non_negative_int(prompt, label):
    """Ask with *prompt* until the user enters a whole number >= 0; return it.

    *label* names the quantity in the "cannot be negative" message.
    """
    while True:
        try:
            value = int(input(prompt))
        except ValueError:
            print("Please enter a valid number.")
            continue
        if value < 0:
            print(f"{label} cannot be negative.")
            continue
        return value


def _prompt_yes_no(prompt):
    """Ask with *prompt* until the answer is yes/no; return 1 for yes, 0 for no."""
    while True:
        answer = input(prompt).strip().lower()
        if answer in ('yes', 'no'):
            return 1 if answer == 'yes' else 0
        print("Please enter 'yes' or 'no'.")

def main():
    """Train on the embedded sample data, then classify postings interactively.

    The sample CSV is preprocessed and used to fit the model once; afterwards
    the user is prompted for postings until they answer anything other than
    "yes" to the "classify another" question.
    """
    # Sample dataset kept inline as CSV text (in practice it would be read
    # from a file or supplied as input)
    raw_csv = """title_length,description_length,has_company_profile,is_fake
72,740,1,yes
95,476,0,no
60,662,1,yes
34,317,0,no
67,884,0,yes
72,626,1,no
71,466,0,no
31,217,0,no
67,891,1,yes
67,92,0,no
95,605,0,yes
58,334,0,yes
61,446,0,yes
51,61,1,yes
79,656,1,no
24,351,0,no
63,947,1,yes
69,302,0,yes
17,548,1,no
62,803,0,yes
69,84,0,no
14,776,0,yes
77,898,0,yes
15,139,0,no
56,825,0,yes
64,654,1,no
49,971,0,yes
61,651,1,yes
25,467,0,yes
22,164,1,no
39,666,1,yes
28,952,1,yes
26,245,0,yes
72,875,0,yes
28,550,1,no
67,675,0,no
64,542,1,yes
99,124,0,no
99,462,1,yes
71,425,1,yes
32,469,0,yes
18,778,1,no
21,326,0,no
10,810,1,no
67,725,1,no
10,443,0,no
43,918,0,no
57,506,0,no
98,969,1,no
10,241,0,no
25,788,1,yes
70,738,1,no
73,148,0,no
72,597,0,no
78,145,1,no
31,713,1,no
76,712,0,no
85,239,0,no
35,785,0,no
25,86,0,no
60,829,0,no
95,418,1,no
66,744,1,yes
38,574,1,yes
87,328,1,no
78,266,0,no
56,916,0,yes
71,922,0,no
78,847,1,yes
85,322,1,yes
25,930,0,yes
99,111,0,yes
99,645,1,yes
57,929,1,no
94,778,0,yes
48,391,0,no
42,446,1,no
32,748,0,yes
19,68,0,yes
78,226,0,no
43,661,1,yes
61,445,1,yes
19,494,1,no
28,282,1,no
67,964,1,yes
10,125,1,no
78,314,1,yes
13,504,0,yes
25,845,0,yes
33,767,0,no
89,784,0,no
11,433,0,no
41,613,0,no
93,900,0,no
33,555,1,yes
21,416,1,yes
59,193,0,yes
44,934,0,yes
42,118,1,no
42,148,0,yes"""

    # Clean the sample data and fit the classifier once up front
    cleaned = preprocess_data(raw_csv)
    classifier = train_model(cleaned)

    # Keep classifying until the user answers anything other than "yes"
    keep_going = True
    while keep_going:
        predict_job_posting(classifier)
        reply = input("\nWould you like to classify another job posting? (yes/no): ")
        keep_going = reply.strip().lower() == 'yes'
    print("Goodbye!")

# Start the interactive demo only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()

Model Accuracy: 0.50

Enter job posting details:
Title length (number of characters): 72
Description length (number of characters): 150
Does the posting have a company profile? (yes/no): yes





Prediction: This job posting is Real
Probability of being fake: 18.00%

Would you like to classify another job posting? (yes/no): no
Goodbye!
